fix merge

Marcin Junczys-Dowmunt 2018-07-27 10:14:21 -07:00
commit dceb7185d8
66 changed files with 2786 additions and 1169 deletions

View File

@ -7,6 +7,41 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
## [Unreleased]
### Added
- Faster training (20-30%) by optimizing gradient propagation of biases
- Returning Moses-style hard alignments during decoding of single models, ensembles and n-best lists
- Hard alignment extraction strategy that takes source words whose attention value exceeds a given threshold (a brief sketch follows this changelog excerpt)
- Refactored sync sgd for easier communication and integration with NCCL
- Smaller memory-overhead for sync-sgd
- NCCL integration (version 2.2.13)
### Fixed
- A couple of bugs in "selection" operators (transpose, shift, cols, rows) during
back-prop for a very specific case: when one of these operators is the first operator after
a branch, gradient propagation could be interrupted. This did not affect
any of the existing models, as such a case was not present, but it might have caused
future models to not train properly.
- Bug in mini-batch-fit: with tied embeddings, the fake source and target batches
contained identical embeddings, which caused under-estimation of memory usage and re-allocation.
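
As a rough illustration of the extraction strategy mentioned above: for every target position, each source position whose attention weight exceeds the threshold is emitted as a Moses-style `src-trg` pair. The following is a standalone sketch, not Marian code; the names and the threshold value are placeholders.

```cpp
#include <cstdio>
#include <vector>

int main() {
  // softAlign[t][s]: attention weight of target word t over source word s
  std::vector<std::vector<float>> softAlign = {{0.7f, 0.2f, 0.1f},
                                               {0.1f, 0.8f, 0.1f},
                                               {0.1f, 0.3f, 0.6f}};
  float threshold = 0.5f;  // hypothetical cut-off, not tied to a specific option here
  for(size_t t = 0; t < softAlign.size(); ++t)
    for(size_t s = 0; s < softAlign[t].size(); ++s)
      if(softAlign[t][s] > threshold)
        std::printf("%zu-%zu ", s, t);  // Moses-style "srcPos-trgPos" pair
  std::printf("\n");
}
```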
## [1.5.0] - 2018-06-17
### Added
- Average Attention Networks for Transformer model
- 16-bit matrix multiplication on CPU
- Memoization for constant nodes for decoding
- Autotuning for decoding
### Fixed
- GPU decoding optimizations, about 2x faster decoding of transformer models
- Multi-node MPI-based training on GPUs
## [1.4.0] - 2018-03-13
### Added

View File

@ -13,6 +13,7 @@ option(COMPILE_CPU "Compile CPU version" ON)
option(COMPILE_CUDA "Compile GPU version" ON)
option(USE_STATIC_LIBS "Compile GPU version" OFF)
option(USE_CUDNN "Use CUDNN library" OFF)
option(USE_NCCL "Use NCCL library" ON)
option(USE_MPI "Use MPI library" OFF)
# Project versioning
@ -49,6 +50,17 @@ if(CUDA_FOUND)
LIST(APPEND CUDA_NVCC_FLAGS -DCUDNN; )
endif(CUDNN_FOUND)
endif(USE_CUDNN)
if(USE_NCCL)
find_package(NCCL)
if(NCCL_FOUND)
include_directories(${NCCL_INCLUDE_DIR})
set(EXT_LIBS ${EXT_LIBS} ${NCCL_LIBRARIES})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_NCCL")
LIST(APPEND CUDA_NVCC_FLAGS -DUSE_NCCL; )
endif(NCCL_FOUND)
endif(USE_NCCL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCUDA_FOUND")
list(APPEND CUDA_NVCC_FLAGS -DCUDA_FOUND; )
else(CUDA_FOUND)

View File

@ -3,6 +3,7 @@ Marian
[![Join the chat at https://gitter.im/marian-nmt](https://badges.gitter.im/amunmt/marian.svg)](https://gitter.im/marian-nmt?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=marian-dev)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev/)
[![CPU Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=marian-dev)](http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/)
[![Tests Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=marian-regression-tests)](http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/)
[![Twitter](https://img.shields.io/twitter/follow/marian_nmt.svg?style=social&label=Follow)](https://twitter.com/intent/follow?screen_name=marian_nmt)
@ -22,7 +23,7 @@ Named in honour of Marian Rejewski, a Polish mathematician and cryptologist.
cd marian-dev
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=relwithdebinfo
cmake .. -DCMAKE_BUILD_TYPE=Release
make -j
```

View File

@ -1 +1 @@
v1.4.0
v1.5.0

cmake/FindNCCL.cmake (new file, 30 lines)
View File

@ -0,0 +1,30 @@
set(NCCL_INC_PATHS
/usr/include
/usr/local/include
/usr/local/cuda/include
$ENV{NCCL_DIR}/include
$ENV{CUDA_TOOLKIT_ROOT_DIR}/include
)
set(NCCL_LIB_PATHS
/lib
/lib64
/usr/lib
/usr/lib64
/usr/local/lib
/usr/local/lib64
/usr/local/cuda/lib64
$ENV{NCCL_DIR}/lib64
$ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64
)
find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
if (NCCL_FOUND)
message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIR}, library: ${NCCL_LIBRARIES})")
mark_as_advanced(NCCL_INCLUDE_DIR NCCL_LIBRARIES)
endif ()
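
For context, the `-DUSE_NCCL` define set above typically guards code that reduces per-GPU gradients with NCCL collectives. The snippet below is only an illustrative sketch of such a guarded call (single-process multi-GPU, one communicator and stream per device, error handling omitted); it is not part of this commit.

```cpp
#ifdef USE_NCCL
#include <cuda_runtime.h>
#include <nccl.h>
#include <vector>

// Sum the gradient buffer of every GPU in place (illustrative only; the
// buffer/communicator/stream setup is assumed to exist elsewhere).
void allReduceGradients(const std::vector<float*>& grads,
                        size_t count,
                        const std::vector<ncclComm_t>& comms,
                        const std::vector<cudaStream_t>& streams) {
  ncclGroupStart();
  for(size_t i = 0; i < comms.size(); ++i)
    ncclAllReduce(grads[i], grads[i], count, ncclFloat, ncclSum,
                  comms[i], streams[i]);
  ncclGroupEnd();
}
#endif
```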

View File

@ -1,17 +0,0 @@
#!/usr/bin/env python
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + "../../build"))
import libmariannmt as nmt
print >>sys.stderr, "marian-nmt version: ", nmt.version()
if len(sys.argv) == 1:
print >>sys.stderr, "Specify s2s arguments"
exit(1)
nmt.init(' '.join(sys.argv))
for line in sys.stdin:
print nmt.translate([line.rstrip()])

View File

@ -1,53 +0,0 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import argparse
sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + "./../build"))
import libmariannmt as nmt
from bottle import request, Bottle, abort
app = Bottle()
@app.route('/translate')
def handle_websocket():
wsock = request.environ.get('wsgi.websocket')
if not wsock:
abort(400, 'Expected WebSocket request.')
while True:
try:
message = wsock.receive()
if message is not None:
# force potential unicode to str() for boost conversion
listSentences = str(message).split('\n')
numEle = len(listSentences)
if numEle > 0 and listSentences[numEle - 1] == "":
del listSentences[numEle - 1]
trans = nmt.translate(listSentences)
wsock.send('\n'.join(trans))
except WebSocketError:
break
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-c", dest="config")
parser.add_argument('-p', dest="port", default=8080, type=int)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
nmt.init("-c {}".format(args.config))
from gevent.pywsgi import WSGIServer
from geventwebsocket import WebSocketError
from geventwebsocket.handler import WebSocketHandler
server = WSGIServer(
("0.0.0.0", args.port), app, handler_class=WebSocketHandler)
server.serve_forever()

View File

@ -37,6 +37,8 @@ add_library(marian STATIC
graph/node_initializers.cpp
layers/convolution.cpp
layers/loss.cpp
layers/weight.cpp
rnn/cells.cpp
rnn/attention.cpp
@ -49,6 +51,7 @@ add_library(marian STATIC
translator/history.cpp
translator/output_collector.cpp
translator/output_printer.cpp
translator/nth_element.cpp
translator/helpers.cpp
translator/scorers.cpp
@ -58,10 +61,13 @@ add_library(marian STATIC
training/graph_group_sync.cpp
training/graph_group_singleton.cpp
training/graph_group_multinode.cpp
training/graph_group_multinode_sync.cpp
training/validator.cpp
training/communicator.cpp
$<TARGET_OBJECTS:libyaml-cpp>
$<TARGET_OBJECTS:SQLiteCpp>)
$<TARGET_OBJECTS:SQLiteCpp>
)
if(CUDA_FOUND)
cuda_add_library(marian_cuda
@ -77,6 +83,7 @@ cuda_add_library(marian_cuda
translator/helpers.cu
training/gradient_dropping/gpu/dropper.cu
training/gradient_dropping/gpu/sparse_algorithm.cu
training/communicator.cu
STATIC)
endif(CUDA_FOUND)
@ -98,14 +105,16 @@ set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab)
set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab)
# marian.zip and marian.tgz
# This combines marian, marian_decoder in a single ZIP or TAR file for execution in MSFT internal tools FLO and Philly.
# For Philly submission, we need statically-linked versions to deal with library dependencies, so this target is only enabled for static builds.
# This combines marian, marian_decoder in a single ZIP or TAR file for
# execution in MSFT internal tools FLO and Philly.
# For Philly submission, we need statically-linked versions to deal with
# library dependencies, so this target is only enabled for static builds.
if(USE_STATIC_LIBS)
add_custom_command(
OUTPUT "${CMAKE_BINARY_DIR}/marian.zip"
COMMAND zip -v -0 -j "${CMAKE_BINARY_DIR}/marian.zip"
"${CMAKE_BINARY_DIR}/marian"
"${CMAKE_BINARY_DIR}/marian-decoder"
"${CMAKE_BINARY_DIR}/marian-decoder"
"${CMAKE_BINARY_DIR}/marian-scorer"
"${CMAKE_BINARY_DIR}/marian-vocab"
DEPENDS marian_train marian_decoder marian_scorer marian_vocab)
@ -146,10 +155,6 @@ endforeach(exec)
#set_target_properties(align2steps PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
if(PYTHONLIBS_FOUND)
# add_subdirectory(python)
endif(PYTHONLIBS_FOUND)
if(COMPILE_TESTS)
set(CATCH_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/3rd_party)
add_library(Catch INTERFACE)

View File

@ -1,16 +1,17 @@
#include "marian.h"
#include "training/graph_group_async.h"
#include "training/graph_group_multinode.h"
#include "training/graph_group_multinode_sync.h"
#include "training/graph_group_singleton.h"
#include "training/graph_group_sync.h"
#include "training/training.h"
#ifdef CUDA_FOUND
#include "training/graph_group_async_drop.h"
#include "training/graph_group_multinode.h"
#endif
bool configureMPI(int, char**);
bool configureMPI(int, char**, bool);
int main(int argc, char** argv) {
using namespace marian;
@ -19,38 +20,54 @@ int main(int argc, char** argv) {
auto devices = options->getDevices();
if(options->get<bool>("multi-node")) {
ABORT_IF(!configureMPI(argc, argv), "MPI not found.");
ABORT_IF(!configureMPI(argc, argv, options->get<bool>("sync-sgd")),
"MPI not found.");
LOG(warn, "[experimental] Running multi-node training");
New<Train<MultiNodeGraphGroup>>(options)->run();
if(options->get<bool>("sync-sgd")) {
New<Train<MultiNodeGraphGroupSync>>(options)->run();
}
else {
#ifdef CUDA_FOUND
New<Train<MultiNodeGraphGroup>>(options)->run();
#else
ABORT("Asynchronous multi-node training requires CUDA");
#endif
}
} else {
if(devices.size() == 1) {
New<Train<SingletonGraph>>(options)->run();
} else {
if(options->get<bool>("sync-sgd"))
if(options->get<bool>("sync-sgd")) {
New<Train<SyncGraphGroup>>(options)->run();
}
else if(options->get<float>("grad-dropping-rate") > 0.0) {
#ifdef CUDA_FOUND
else if(options->get<float>("grad-dropping-rate") > 0.0)
New<Train<AsyncGraphGroupDrop>>(options)->run();
#else
ABORT("Asynchronous training with gradient dropping requires CUDA");
#endif
else
}
else {
New<Train<AsyncGraphGroup>>(options)->run();
}
}
}
return 0;
}
bool configureMPI(int argc, char** argv) {
bool configureMPI(int argc, char** argv, bool sync) {
bool enable = false;
#if MPI_FOUND
int required_mode = sync ? MPI_THREAD_SERIALIZED : MPI_THREAD_MULTIPLE;
int provided_thread_mode = 0;
MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided_thread_mode);
// Enable if occasional truncation errors
MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);
ABORT_IF(
provided_thread_mode < MPI_THREAD_MULTIPLE,
provided_thread_mode < required_mode,
"Your version of MPI does not support multi-threaded communication.");
enable = true;
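
The extra `sync` argument to `configureMPI` lowers the required MPI threading level for synchronous training: serialized MPI calls are enough for sync-sgd, while asynchronous multi-node training still needs `MPI_THREAD_MULTIPLE`. A standalone sketch of that request/check pattern (not Marian code; `sync` stands in for the `--sync-sgd` setting):

```cpp
#include <cstdio>
#include <mpi.h>

int main(int argc, char** argv) {
  bool sync = true;  // placeholder for the --sync-sgd flag
  int required = sync ? MPI_THREAD_SERIALIZED : MPI_THREAD_MULTIPLE;
  int provided = 0;
  MPI_Init_thread(&argc, &argv, required, &provided);
  if(provided < required)
    std::fprintf(stderr, "MPI build does not provide the requested threading level\n");
  MPI_Finalize();
  return 0;
}
```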

View File

@ -490,6 +490,10 @@ void ConfigParser::addOptionsTraining(po::options_description& desc) {
->multitoken()
->default_value(std::vector<std::string>({"0"}), "0"),
"GPU ID(s) to use for training")
#ifdef USE_NCCL
("no-nccl", po::value<bool>()->zero_tokens()->default_value(false),
"Disable inter-GPU communication via NCCL")
#endif
#ifdef CUDA_FOUND
("cpu-threads", po::value<size_t>()->default_value(0)->implicit_value(1),
"Use CPU-based computation with this many independent threads, 0 means GPU-based computation")
@ -607,10 +611,6 @@ void ConfigParser::addOptionsTraining(po::options_description& desc) {
("multi-node-overlap", po::value<bool>()
->default_value(true),
"Overlap model computations with MPI communication")
("multi-node-local-optimizers", po::value<bool>()
->zero_tokens()
->default_value(false),
"Enable local optimizers with multi-node. Requires optimizer delay to be turned on.")
;
// clang-format on
desc.add(training);
@ -722,9 +722,10 @@ void ConfigParser::addOptionsTranslate(po::options_description& desc) {
"Display n-best list")
("shortlist", po::value<std::vector<std::string>>()->multitoken(),
"Use softmax shortlist: path first best prune")
("weights", po::value<std::vector<float>>()
->multitoken(),
("weights", po::value<std::vector<float>>()->multitoken(),
"Scorer weights")
("alignment", po::value<float>()->default_value(0.f)->implicit_value(1.f),
"Return word alignments")
// TODO: the options should be available only in server
("port,p", po::value<size_t>()->default_value(8080),
"Port number for web socket server")
@ -1006,7 +1007,10 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("multi-node", bool);
SET_OPTION("multi-node-overlap", bool);
SET_OPTION("multi-node-local-optimizers", bool);
#ifdef USE_NCCL
SET_OPTION("no-nccl", bool);
#endif
}
if(mode_ == ConfigMode::rescoring) {
@ -1031,6 +1035,7 @@ void ConfigParser::parseOptions(int argc, char** argv, bool doValidate) {
SET_OPTION("mini-batch-words", int);
SET_OPTION_NONDEFAULT("weights", std::vector<float>);
SET_OPTION_NONDEFAULT("shortlist", std::vector<std::string>);
SET_OPTION("alignment", float);
SET_OPTION("port", size_t);
SET_OPTION("optimize", bool);
SET_OPTION("max-length-factor", float);

View File

@ -207,6 +207,13 @@ public:
return currentBatch_;
}
std::vector<BatchPtr> nextN(size_t num) {
std::vector<BatchPtr> batches;
for(int i = 0; i < num && *this; ++i)
batches.push_back(next());
return batches;
}
void prepare(bool shuffle = true) {
if(shuffle)
data_->shuffle();

View File

@ -172,33 +172,35 @@ public:
* @see marian::data::Batch::split(size_t n)
*/
std::vector<Ptr<SubBatch>> split(size_t n) {
ABORT_IF(size_ == 0, "Encountered sub-batch size of 0");
std::vector<Ptr<SubBatch>> splits;
size_t subSize = std::ceil(size_ / (float)n);
size_t totSize = size_;
size_t restSize = size_;
int pos = 0;
for(int k = 0; k < n; ++k) {
size_t __size__ = std::min(subSize, totSize);
size_t __size__ = std::min(subSize, restSize);
if(__size__ > 0) {
auto sb = New<SubBatch>(__size__, width_, vocab_);
auto sb = New<SubBatch>(__size__, width_, vocab_);
size_t __words__ = 0;
for(int j = 0; j < width_; ++j) {
for(int i = 0; i < __size__; ++i) {
sb->data()[j * __size__ + i] = indices_[j * size_ + pos + i];
sb->mask()[j * __size__ + i] = mask_[j * size_ + pos + i];
size_t __words__ = 0;
for(int j = 0; j < width_; ++j) {
for(int i = 0; i < __size__; ++i) {
sb->data()[j * __size__ + i] = indices_[j * size_ + pos + i];
sb->mask()[j * __size__ + i] = mask_[j * size_ + pos + i];
if(mask_[j * size_ + pos + i] != 0)
__words__++;
if(mask_[j * size_ + pos + i] != 0)
__words__++;
}
}
sb->setWords(__words__);
splits.push_back(sb);
restSize -= __size__;
pos += __size__;
}
sb->setWords(__words__);
splits.push_back(sb);
totSize -= __size__;
pos += __size__;
}
return splits;
}
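
With the rename from `totSize` to `restSize`, the splitting arithmetic reads more naturally: each of the `n` pieces takes at most `ceil(size/n)` sentences, bounded by what remains. A standalone example of the resulting piece sizes:

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>

int main() {
  size_t size = 5, n = 2;                        // 5 sentences, split into 2 pieces
  size_t subSize = std::ceil(size / (float)n);   // 3
  size_t rest = size;
  size_t first = std::min(subSize, rest);        // 3
  rest -= first;
  size_t second = std::min(subSize, rest);       // 2
  assert(first == 3 && second == 2);
}
```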
@ -260,7 +262,7 @@ public:
* @brief The number of sentences in the batch, target words.
*/
size_t sizeTrg() const { return subBatches_.back()->batchSize(); }
/**
* @brief The number of words for the longest sentence in the batch plus one.
*/
@ -291,17 +293,19 @@ public:
Ptr<Options> options) {
std::vector<Ptr<SubBatch>> batches;
size_t idx = 0;
for(auto len : lengths) {
auto vocab = New<Vocab>();
vocab->createFake();
auto sb = New<SubBatch>(batchSize, len, vocab); // data: gets initialized to 0. No EOS symbol is distinguished.
std::fill(sb->data().begin(), sb->data().end(), idx++); // set word indices to different values to avoid same hashes
std::fill(sb->mask().begin(), sb->mask().end(), 1); // mask: no items are being masked out
batches.push_back(sb);
}
auto batch = New<CorpusBatch>(batches);
if(!options) return batch;
if(options->has("guided-alignment")) {
@ -331,12 +335,17 @@ public:
* @see marian::data::SubBatch::split(size_t n)
*/
std::vector<Ptr<Batch>> split(size_t n) {
ABORT_IF(size() == 0, "Encountered batch size of 0");
std::vector<std::vector<Ptr<SubBatch>>> subs;
// split each subbatch separately
std::vector<std::vector<Ptr<SubBatch>>> subs(n);
for(auto subBatch : subBatches_) {
size_t i = 0;
for(auto splitSubBatch : subBatch->split(n))
for(auto splitSubBatch : subBatch->split(n)) {
if(subs.size() <= i)
subs.resize(i + 1);
subs[i++].push_back(splitSubBatch);
}
}
// create batches from split subbatches

View File

@ -99,7 +99,7 @@ void CorpusSQLite::fillSQLite() {
}
}
db_->exec("commit;");
LOG(info, "[sqlite] Inserted {} lines", lines);
LOG(info, "[sqlite] Inserted {} lines", lines - 1);
LOG(info, "[sqlite] Creating primary index");
db_->exec("create unique index idx_line on lines (_id);");
}

View File

@ -1,5 +1,7 @@
#pragma once
#include <cmath>
#include "functional/defs.h"
#include "functional/operands.h"

View File

@ -244,12 +244,13 @@ public:
}
}
void backward() {
void backward(bool zero = true) {
ABORT_IF(topNodes_.size() > 1,
"There are more than one top most node for backward step");
params_->allocateBackward();
params_->set_zero_adjoint();
if(zero)
params_->set_zero_adjoint();
for(auto&& v : topNodes_)
v->init_dependent();
@ -264,7 +265,7 @@ public:
nodesBackward_.pop_back();
for(auto&& child : v->children()) {
if(child->trainable())
if(child->trainable() && child->type() != "param")
child->set_zero_adjoint();
}
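
The new `zero` flag on `backward()` presumably exists so that parameter gradients can be accumulated across several backward passes before one optimizer step (they are only cleared when requested). A minimal standalone sketch of that accumulation pattern, not using the Marian API:

```cpp
#include <algorithm>
#include <vector>

int main() {
  std::vector<float> grad(4, 0.f);  // stands in for a parameter gradient buffer
  int delay = 4;                    // number of sub-batches to accumulate
  for(int step = 0; step < delay; ++step) {
    if(step == 0)
      std::fill(grad.begin(), grad.end(), 0.f);  // zero only once per update
    for(auto& g : grad)
      g += 0.25f;                   // stand-in for one sub-batch's gradients
  }
  // an optimizer update would consume the accumulated gradient here
}
```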

View File

@ -313,7 +313,9 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
if(bc != b)
bc = rec2(bc);
std::vector<Expr> nodes = {ac, bc, bias};
int rows = ac->shape().elements() / ac->shape()[-1];
Expr ones = ac->graph()->ones({rows, 1});
std::vector<Expr> nodes = {ac, bc, bias, ones};
return rec2(Expression<AffineNodeOp>(nodes, transA, transB, scale),
true);
};
@ -333,13 +335,16 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
}
else {
// general version, MKL, CBlas or CUDA
// if clipValue > 0, the inputs will be clipped to range [-clipValue, clipValue]
// This is meant to keep values at the same range as used during training when
// optimizing for 8-bit integer products. Likely to be removed in the future
// when we explore better ways to handle this.
std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
int rows = a->shape().elements() / a->shape()[-1];
Expr ones = a->graph()->ones({rows, 1});
std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias, ones};
return Expression<AffineNodeOp>(nodes, transA, transB, scale);
}
}
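
The `ones` column vector added to the affine node is what the changelog's "optimizing gradient propagation of biases" refers to: the forward bias add can be expressed as the rank-1 product `ones * bias` folded into a second GEMM call, and the bias gradient becomes `ones^T * adj`, i.e. the column-wise sum of the adjoint computed as a matrix product rather than a generic `Add` reduction. A standalone numeric check of that identity (plain loops, not Marian code):

```cpp
#include <cassert>
#include <vector>

int main() {
  int rows = 3, cols = 2;
  std::vector<float> adj = {1, 2, 3, 4, 5, 6};  // dL/dy, row-major (rows x cols)
  std::vector<float> ones(rows, 1.f);
  std::vector<float> dbias(cols, 0.f);
  // dbias = ones^T * adj  ==  column-wise sum of adj
  for(int j = 0; j < cols; ++j)
    for(int i = 0; i < rows; ++i)
      dbias[j] += ones[i] * adj[i * cols + j];
  assert(dbias[0] == 9.f && dbias[1] == 12.f);
  return 0;
}
```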
@ -462,6 +467,7 @@ Expr shift(Expr a, Shape shift, float padValue) {
//}
#ifdef CUDA_FOUND
#ifdef CUDNN
Expr avg_pooling(Expr x,
int height,
@ -525,5 +531,6 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
return Expression<PoolingWithMaskingOp>(x, mask, width, isEven);
}
#endif
#endif
}

View File

@ -106,7 +106,6 @@ Expr flatten_2d(Expr a);
Expr rows(Expr a, const std::vector<size_t>& indices);
Expr cols(Expr a, const std::vector<size_t>& indices);
Expr select(Expr a, int axis, const std::vector<size_t>& indices);
/*********************************************************/

View File

@ -4,9 +4,12 @@
#include "functional/functional.h"
#include "graph/node.h"
#include "tensors/gpu/cudnn_wrappers.h"
#include "tensors/tensor_operators.h"
#ifdef CUDNN
#include "tensors/gpu/cudnn_wrappers.h"
#endif
namespace marian {
class DotNodeOp : public NaryNodeOp {
@ -167,15 +170,17 @@ public:
NodeOps forwardOps() {
using namespace functional;
return {
NodeOp(ProdWithBias(val_,
child(0)->val(),
child(1)->val(),
child(2)->val(),
transA_,
transB_,
0.f,
scalar_))
NodeOp(Prod(val_,
child(0)->val(),
child(1)->val(),
transA_, transB_, 0.f, scalar_);
Prod(val_,
child(3)->val(),
child(2)->val(),
false, false, 1.f, 1.f)
)
};
}
@ -202,7 +207,12 @@ public:
false,
1.0,
scalar_)),
NodeOp(Add(_1, child(2)->grad(), adj_))};
NodeOp(Prod(child(2)->grad(),
child(3)->val(), adj_,
true, false,
0.f, 1.f))
//NodeOp(Add(_1, child(2)->grad(), adj_))
};
if(transA_ && !transB_)
return {NodeOp(Prod(child(0)->grad(),
@ -219,7 +229,12 @@ public:
false,
1.0,
scalar_)),
NodeOp(Add(_1, child(2)->grad(), adj_))};
NodeOp(Prod(child(2)->grad(),
child(3)->val(), adj_,
true, false,
0.f, 1.f))
//NodeOp(Add(_1, child(2)->grad(), adj_))
};
if(transA_ && transB_)
return {NodeOp(Prod(child(0)->grad(),
@ -236,7 +251,12 @@ public:
true,
1.0,
scalar_)),
NodeOp(Add(_1, child(2)->grad(), adj_))};
NodeOp(Prod(child(2)->grad(),
child(3)->val(), adj_,
true, false,
0.f, 1.f))
//NodeOp(Add(_1, child(2)->grad(), adj_))
};
return {NodeOp(Prod(child(0)->grad(),
adj_,
@ -252,7 +272,12 @@ public:
false,
1.0,
scalar_)),
NodeOp(Add(_1, child(2)->grad(), adj_))};
NodeOp(Prod(child(2)->grad(),
child(3)->val(), adj_,
true, false,
0.f, 1.f))
//NodeOp(Add(_1, child(2)->grad(), adj_))
};
}
const std::string type() { return "affine"; }
@ -294,6 +319,7 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(ProdBatched(val_,
graph()->allocator(),
child(0)->val(),
child(1)->val(),
transA_,
@ -311,6 +337,7 @@ public:
if(!transA_ && transB_)
return {NodeOp(ProdBatched(child(0)->grad(),
graph()->allocator(),
adj_,
child(1)->val(),
false,
@ -318,6 +345,7 @@ public:
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
graph()->allocator(),
adj_,
child(0)->val(),
true,
@ -327,6 +355,7 @@ public:
if(transA_ && !transB_)
return {NodeOp(ProdBatched(child(0)->grad(),
graph()->allocator(),
child(1)->val(),
adj_,
false,
@ -334,6 +363,7 @@ public:
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
graph()->allocator(),
child(0)->val(),
adj_,
false,
@ -343,6 +373,7 @@ public:
if(transA_ && transB_)
return {NodeOp(ProdBatched(child(0)->grad(),
graph()->allocator(),
child(1)->val(),
adj_,
true,
@ -350,6 +381,7 @@ public:
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
graph()->allocator(),
adj_,
child(0)->val(),
true,
@ -358,6 +390,7 @@ public:
scalar_))};
return {NodeOp(ProdBatched(child(0)->grad(),
graph()->allocator(),
adj_,
child(1)->val(),
false,
@ -365,6 +398,7 @@ public:
1.0,
scalar_)),
NodeOp(ProdBatched(child(1)->grad(),
graph()->allocator(),
child(0)->val(),
adj_,
true,
@ -766,6 +800,7 @@ struct HighwayNodeOp : public NaryNodeOp {
const std::string type() { return "highway"; }
};
#ifdef CUDNN
class ConvolutionOp : public NaryNodeOp {
public:
ConvolutionOp(const std::vector<Expr>& nodes,
@ -802,4 +837,5 @@ public:
protected:
ConvolutionWrapper conv_;
};
#endif
}

View File

@ -7,7 +7,9 @@
#include "graph/node.h"
#include "tensors/tensor_operators.h"
//#include "tensors/gpu/cudnn_wrappers.h"
#ifdef CUDNN
#include "tensors/gpu/cudnn_wrappers.h"
#endif
namespace marian {
@ -815,7 +817,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
}
NodeOps backwardOps() {
return {NodeOp(TransposeND(child(0)->grad(), adj_, axes_))};
return {NodeOp(TransposeNDGrad(child(0)->grad(), adj_, axes_))};
}
template <class... Args>
@ -1009,7 +1011,9 @@ struct ShiftNodeOp : public UnaryNodeOp {
}
NodeOps backwardOps() {
return {NodeOp(Shift(child(0)->grad(), adj_, shift_, /*padValue=*/0.f, /*invert=*/true))};
// last parameter beta=1 says to use += (out = in + beta * out)
// @TODO: check need for padValue_
return {NodeOp(ShiftGrad(child(0)->grad(), adj_, shift_, true))};
}
const std::string type() { return "shift"; }
@ -1076,6 +1080,7 @@ struct ShiftNodeOp : public UnaryNodeOp {
// Ptr<sparse::CSR> lf_;
//};
#ifdef CUDNN
class PoolingOp : public UnaryNodeOp {
public:
PoolingOp(Expr x,
@ -1109,6 +1114,7 @@ public:
protected:
PoolingWrapper pooling_;
};
#endif
class PoolingWithMaskingOp : public UnaryNodeOp {
public:

View File

@ -2,6 +2,8 @@
#include "graph/node_operators_binary.h"
namespace marian {
#ifdef CUDNN
Convolution::Convolution(Ptr<ExpressionGraph> graph) : Factory(graph) {}
Expr Convolution::apply(Expr x) {
@ -29,4 +31,6 @@ Expr Convolution::apply(const std::vector<Expr>&) {
ABORT("Can't apply convolution on many inputs at once");
return nullptr;
}
#endif
}

View File

@ -7,6 +7,7 @@
namespace marian {
#ifdef CUDNN
class Convolution : public Factory {
protected:
Ptr<Options> getOptions() { return options_; }
@ -82,4 +83,6 @@ protected:
std::vector<int> kernelNums_;
int stride_;
};
#endif
}

View File

@ -2,11 +2,14 @@
#include "marian.h"
#include "layers/factory.h"
#include "data/shortlist.h"
#include "layers/factory.h"
namespace marian {
namespace mlp {
/**
* @brief Activation functions
*/
enum struct act : int { linear, tanh, sigmoid, ReLU, LeakyReLU, PReLU, swish };
}
}
@ -64,12 +67,9 @@ public:
if(inputs.size() > 1)
num = std::to_string(i);
Expr W = g->param(name + "_W" + num,
{in->shape()[-1], dim},
inits::glorot_uniform);
Expr b = g->param(name + "_b" + num,
{1, dim},
inits::zeros);
Expr W = g->param(
name + "_W" + num, {in->shape()[-1], dim}, inits::glorot_uniform);
Expr b = g->param(name + "_b" + num, {1, dim}, inits::zeros);
if(useLayerNorm) {
if(useNematusNorm) {
@ -82,9 +82,8 @@ public:
outputs.push_back(layerNorm(affine(in, W, b), ln_s, ln_b, NEMATUS_LN_EPS));
} else {
auto gamma = g->param(name + "_gamma" + num,
{1, dim},
inits::from_value(1.0));
auto gamma = g->param(
name + "_gamma" + num, {1, dim}, inits::from_value(1.0));
outputs.push_back(layerNorm(dot(in, W), gamma, b));
}
@ -107,9 +106,7 @@ public:
}
};
Expr apply(Expr input) {
return apply(std::vector<Expr>({input}));
}
Expr apply(Expr input) { return apply(std::vector<Expr>({input})); }
};
class Output : public Layer {
@ -129,9 +126,7 @@ public:
tiedParams_[param] = graph_->get(tied);
}
void set_shortlist(Ptr<data::Shortlist> shortlist) {
shortlist_ = shortlist;
}
void set_shortlist(Ptr<data::Shortlist> shortlist) { shortlist_ = shortlist; }
Expr apply(Expr input) {
if(!W_) {
@ -146,15 +141,13 @@ public:
W_ = rows(W_, shortlist_->indices());
} else {
W_ = graph_->param(name + "_" + nameW,
{input->shape()[-1], dim},
inits::glorot_uniform);
{input->shape()[-1], dim},
inits::glorot_uniform);
if(shortlist_)
W_ = cols(W_, shortlist_->indices());
}
b_ = graph_->param(name + "_b",
{1, dim},
inits::zeros);
b_ = graph_->param(name + "_b", {1, dim}, inits::zeros);
if(shortlist_)
b_ = cols(b_, shortlist_->indices());
}
@ -165,10 +158,8 @@ public:
virtual Expr apply(const std::vector<Expr>& inputs) {
ABORT("Not implemented");
};
};
} // namespace mlp
struct EmbeddingFactory : public Factory {
@ -195,51 +186,4 @@ struct EmbeddingFactory : public Factory {
};
typedef Accumulator<EmbeddingFactory> embedding;
static inline Expr Cost(Expr logits,
Expr indices,
Expr mask,
std::string costType = "cross-entropy",
float smoothing = 0,
Expr weights = nullptr) {
using namespace keywords;
auto ce = cross_entropy(logits, indices);
if(weights)
ce = weights * ce;
if(smoothing > 0) {
// @TODO: add this to CE kernels instead
auto ceq = mean(logsoftmax(logits), axis = -1);
ce = (1 - smoothing) * ce - smoothing * ceq;
}
if(mask)
ce = ce * mask;
auto costSum = sum(ce, axis = -3);
Expr cost;
// axes:
// - time axis (words): -3
// - batch axis (sentences): -2
if(costType == "ce-mean"
|| costType
== "cross-entropy") { // sum over words; average over sentences
cost = mean(costSum, axis = -2);
} else if(costType == "ce-mean-words") { // average over target tokens
cost = sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2);
} else if(costType == "ce-sum") { // sum over target tokens
cost = sum(costSum, axis = -2);
} else if(costType == "perplexity") { // ==exp('ce-mean-words')
cost = exp(sum(costSum, axis = -2) / sum(sum(mask, axis = -3), axis = -2));
} else if(costType == "ce-rescore") { // sum over words, keep batch axis
cost = -costSum;
} else { // same as ce-mean
cost = mean(costSum, axis = -2);
}
return cost;
}
}

src/layers/loss.cpp (new file, 94 lines)
View File

@ -0,0 +1,94 @@
#include "layers/loss.h"
namespace marian {
Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference) {
float smoothing = inference ? 0.f : options->get<float>("label-smoothing");
std::string costType = options->get<std::string>("cost-type", "ce-mean");
if(costType == "ce-mean" || costType == "cross-entropy") {
return New<CrossEntropyMeanLoss>(smoothing);
} else if(costType == "ce-mean-words") {
return New<CrossEntropyMeanWordsLoss>(smoothing);
} else if(costType == "ce-sum") {
return New<CrossEntropySumLoss>(smoothing);
} else if(costType == "perplexity") {
return New<PerplexityLoss>(smoothing);
} else if(costType == "ce-rescore") {
return New<CrossEntropyRescoreLoss>(smoothing);
} else { // same as ce-mean
return New<CrossEntropyMeanLoss>(smoothing);
}
}
Expr LossBase::getCrossEntropy(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
using namespace keywords;
auto ce = cross_entropy(logits, indices);
if(smoothing_ > 0) {
// @TODO: add this to CE kernels instead
auto ceq = mean(logsoftmax(logits), axis = -1);
ce = (1 - smoothing_) * ce - smoothing_ * ceq;
}
if(mask)
ce = ce * mask;
if(weights)
ce = ce * weights;
return ce;
}
Expr CrossEntropyMeanLoss::getCost(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
using namespace keywords;
auto ce = getCrossEntropy(logits, indices, mask, weights);
// Time axis (words): -3
// Batch axis (sentences): -2
return mean(sum(ce, axis = -3), axis = -2);
}
Expr CrossEntropyMeanWordsLoss::getCost(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
using namespace keywords;
auto ce = getCrossEntropy(logits, indices, mask, weights);
return sum(sum(ce, axis = -3), axis = -2)
/ sum(sum(mask, axis = -3), axis = -2);
}
Expr CrossEntropySumLoss::getCost(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
using namespace keywords;
auto ce = getCrossEntropy(logits, indices, mask, weights);
return sum(sum(ce, axis = -3), axis = -2);
}
Expr PerplexityLoss::getCost(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
using namespace keywords;
auto ce = getCrossEntropy(logits, indices, mask, weights);
return exp(sum(sum(ce, axis = -3), axis = -2)
/ sum(sum(mask, axis = -3), axis = -2));
}
Expr CrossEntropyRescoreLoss::getCost(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
using namespace keywords;
auto ce = getCrossEntropy(logits, indices, mask, weights);
return -sum(ce, axis = -3);
}
}
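
Restating what the loss classes above compute, with ce_{t,s} the per-token cross-entropy after label smoothing, masking and optional weighting (as produced by `getCrossEntropy`), m_{t,s} the target mask and S the number of sentences:

```latex
\begin{align*}
\text{ce-sum}        &= \sum_{s}\sum_{t} \mathrm{ce}_{t,s} \\
\text{ce-mean}       &= \frac{1}{S}\sum_{s}\sum_{t} \mathrm{ce}_{t,s} \\
\text{ce-mean-words} &= \frac{\sum_{s,t} \mathrm{ce}_{t,s}}{\sum_{s,t} m_{t,s}} \\
\text{perplexity}    &= \exp(\text{ce-mean-words}) \\
\text{ce-rescore}_s  &= -\sum_{t} \mathrm{ce}_{t,s}
\end{align*}
```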

src/layers/loss.h (new file, 70 lines)
View File

@ -0,0 +1,70 @@
#pragma once
#include "marian.h"
namespace marian {
class LossBase {
protected:
float smoothing_;
public:
explicit LossBase(float smoothing = 0) : smoothing_(smoothing){};
Expr getCrossEntropy(Expr logits, Expr indices, Expr mask, Expr weights);
virtual Expr getCost(Expr logits,
Expr indices,
Expr mask,
Expr weights = nullptr)
= 0;
};
/*
* @brief The cross entropy loss function
*
* A sum over words and average over sentences
*/
class CrossEntropyMeanLoss : public LossBase {
public:
explicit CrossEntropyMeanLoss(float smoothing = 0) : LossBase(smoothing){};
Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};
/*
* @brief The cross entropy loss function as an average over target tokens
*/
class CrossEntropyMeanWordsLoss : public LossBase {
public:
explicit CrossEntropyMeanWordsLoss(float smoothing = 0)
: LossBase(smoothing){};
Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};
/*
* @brief The cross entropy loss function as a sum over target tokens
*/
class CrossEntropySumLoss : public LossBase {
public:
explicit CrossEntropySumLoss(float smoothing = 0) : LossBase(smoothing){};
Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};
/*
* @brief The perplexity loss function
*/
class PerplexityLoss : public LossBase {
public:
explicit PerplexityLoss(float smoothing = 0) : LossBase(smoothing){};
Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};
/*
* @brief The cross entropy loss function that keeps sentence-level costs
*/
class CrossEntropyRescoreLoss : public LossBase {
public:
explicit CrossEntropyRescoreLoss(float smoothing = 0) : LossBase(smoothing){};
Expr getCost(Expr logits, Expr indices, Expr mask, Expr weights);
};
Ptr<LossBase> LossFactory(Ptr<Options> options, bool inference);
}

src/layers/weight.cpp (new file, 21 lines)
View File

@ -0,0 +1,21 @@
#include "layers/weight.h"
namespace marian {
Ptr<WeightingBase> WeightingFactory(Ptr<Options> options) {
if(options->has("data-weighting"))
return New<DataWeighting>(options->get<std::string>("data-weighting-type"));
}
Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
ABORT_IF(batch->getDataWeights().empty(),
"Vector of weights is unexpectedly empty!");
bool sentenceWeighting = weightingType_ == "sentence";
int dimBatch = batch->size();
int dimWords = sentenceWeighting ? 1 : batch->back()->batchWidth();
auto weights = graph->constant({1, dimWords, dimBatch, 1},
inits::from_vector(batch->getDataWeights()));
return weights;
}
}

src/layers/weight.h (new file, 33 lines)
View File

@ -0,0 +1,33 @@
#pragma once
#include "common/options.h"
#include "data/corpus.h"
#include "graph/expression_graph.h"
#include "graph/expression_operators.h"
#include "graph/node_initializers.h"
namespace marian {
class WeightingBase {
public:
WeightingBase(){};
virtual Expr getWeights(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch)
= 0;
virtual void debugWeighting(std::vector<float> weightedMask,
std::vector<float> freqMask,
Ptr<data::CorpusBatch> batch){};
};
class DataWeighting : public WeightingBase {
protected:
std::string weightingType_;
public:
DataWeighting(std::string weightingType)
: WeightingBase(), weightingType_(weightingType){};
Expr getWeights(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch);
};
Ptr<WeightingBase> WeightingFactory(Ptr<Options> options);
}

View File

@ -1,8 +1,10 @@
#pragma once
#include "models/encoder_decoder.h"
#include "layers/generic.h"
#include "layers/guided_alignment.h"
#include "layers/loss.h"
#include "layers/weight.h"
#include "models/encoder_decoder.h"
namespace marian {
namespace models {
@ -12,58 +14,57 @@ public:
virtual Expr apply(Ptr<ModelBase> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool clearGraph = true) = 0;
bool clearGraph = true)
= 0;
};
class EncoderDecoderCE : public CostBase {
protected:
Ptr<Options> options_;
bool inference_{false};
bool toBeWeighted_{false};
Ptr<LossBase> loss_;
Ptr<WeightingBase> weighter_;
public:
EncoderDecoderCE(Ptr<Options> options)
: options_(options) {}
: options_(options), inference_(options->get<bool>("inference", false)) {
loss_ = LossFactory(options_, inference_);
toBeWeighted_ = (options_->has("data-weighting") && !inference_)
|| (options_->has("dynamic-weighting")
&& options_->get<bool>("dynamic-weighting")
&& !inference_);
if(toBeWeighted_)
weighter_ = WeightingFactory(options_);
}
Expr apply(Ptr<ModelBase> model,
Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool clearGraph = true) {
auto encdec = std::static_pointer_cast<EncoderDecoder>(model);
auto corpusBatch = std::static_pointer_cast<data::CorpusBatch>(batch);
auto state = encdec->stepAll(graph, corpusBatch, clearGraph);
std::string costType = options_->get<std::string>("cost-type");
bool inference = options_->get<bool>("inference", false);
float ls = inference ? 0.f : options_->get<float>("label-smoothing");
float ls = inference_ ? 0.f : options_->get<float>("label-smoothing");
Expr weights;
Expr cost;
bool sentenceWeighting = false;
if(options_->has("data-weighting") && !inference) {
ABORT_IF(corpusBatch->getDataWeights().empty(),
"Vector of weights is unexpectedly empty!");
sentenceWeighting
= options_->get<std::string>("data-weighting-type") == "sentence";
int dimBatch = corpusBatch->size();
int dimWords = sentenceWeighting ? 1 : corpusBatch->back()->batchWidth();
weights = graph->constant({1, dimWords, dimBatch, 1},
inits::from_vector(corpusBatch->getDataWeights()));
if(toBeWeighted_) {
weights = weighter_->getWeights(graph, corpusBatch);
}
auto cost
= Cost(state->getProbs(),
state->getTargetIndices(),
state->getTargetMask(),
costType,
ls,
weights);
cost = loss_->getCost(state->getProbs(),
state->getTargetIndices(),
state->getTargetMask(),
weights);
if(options_->has("guided-alignment") && !inference) {
if(options_->has("guided-alignment") && !inference_) {
auto alignments = encdec->getDecoders()[0]->getAlignments();
ABORT_IF(alignments.empty(), "Model does not seem to support alignments");
@ -73,8 +74,6 @@ public:
} else {
return cost;
}
return cost;
}
};
@ -85,7 +84,7 @@ protected:
public:
Trainer(Ptr<ModelBase> model, Ptr<CostBase> cost)
: model_(model), cost_(cost) {}
: model_(model), cost_(cost) {}
Ptr<ModelBase> getModel() { return model_; }
@ -104,16 +103,10 @@ public:
virtual Expr build(Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
bool clearGraph = true) {
return cost_->apply(model_,
graph,
batch,
clearGraph);
};
virtual void clear(Ptr<ExpressionGraph> graph) {
model_->clear(graph);
return cost_->apply(model_, graph, batch, clearGraph);
};
virtual void clear(Ptr<ExpressionGraph> graph) { model_->clear(graph); };
};
typedef Trainer Scorer;
@ -138,11 +131,11 @@ protected:
public:
Stepwise(Ptr<EncoderDecoderBase> encdec, Ptr<CostStep> cost)
: encdec_(encdec), cost_(cost) {}
: encdec_(encdec), cost_(cost) {}
virtual void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded = true) {
bool markedReloaded = true) {
encdec_->load(graph, name, markedReloaded);
}
@ -152,9 +145,7 @@ public:
encdec_->save(graph, name, saveTranslatorConfig);
}
virtual void clear(Ptr<ExpressionGraph> graph) {
encdec_->clear(graph);
}
virtual void clear(Ptr<ExpressionGraph> graph) { encdec_->clear(graph); }
virtual Expr build(Ptr<ExpressionGraph> graph,
Ptr<data::Batch> batch,
@ -174,7 +165,8 @@ public:
const std::vector<size_t>& embIndices,
int dimBatch,
int beamSize) {
auto nextState = encdec_->step(graph, state, hypIndices, embIndices, dimBatch, beamSize);
auto nextState = encdec_->step(
graph, state, hypIndices, embIndices, dimBatch, beamSize);
return cost_->apply(nextState);
}
@ -185,11 +177,10 @@ public:
return nullptr;
}
virtual Ptr<Options> getOptions() {
return encdec_->getOptions();
};
virtual Ptr<Options> getOptions() { return encdec_->getOptions(); };
virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) {
virtual void setShortlistGenerator(
Ptr<data::ShortlistGenerator> shortlistGenerator) {
encdec_->setShortlistGenerator(shortlistGenerator);
};
@ -197,10 +188,14 @@ public:
return encdec_->getShortlist();
};
virtual std::vector<float> getAlignment() {
return encdec_->getAlignment();
}
};
static Ptr<ModelBase> add_cost(Ptr<EncoderDecoder> encdec, Ptr<Options> options) {
switch (options->get<usage>("usage", usage::raw)) {
static Ptr<ModelBase> add_cost(Ptr<EncoderDecoder> encdec,
Ptr<Options> options) {
switch(options->get<usage>("usage", usage::raw)) {
case usage::training:
return New<Trainer>(encdec, New<EncoderDecoderCE>(options));
case usage::scoring:
@ -208,10 +203,8 @@ static Ptr<ModelBase> add_cost(Ptr<EncoderDecoder> encdec, Ptr<Options> options)
case usage::translation:
return New<Stepwise>(encdec, New<LogsoftmaxStep>());
case usage::raw:
default:
return encdec;
default: return encdec;
}
}
}
}

View File

@ -44,6 +44,8 @@ public:
virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) = 0;
virtual Ptr<data::Shortlist> getShortlist() = 0;
virtual std::vector<float> getAlignment() = 0;
};
class EncoderDecoder : public EncoderDecoderBase {
@ -113,6 +115,12 @@ public:
return decoders_[0]->getShortlist();
};
virtual std::vector<float> getAlignment() {
std::vector<float> softAlign;
decoders_[0]->getAlignments()[0]->val()->get(softAlign);
return softAlign;
};
/*********************************************************************/
virtual Ptr<DecoderState> startState(Ptr<ExpressionGraph> graph,

View File

@ -21,6 +21,10 @@ public:
ABORT_IF(options_->get<std::string>("dec-cell") != "gru-nematus",
"--type nematus does not currently support other rnn cells "
"than gru-nematus, use --type s2s");
ABORT_IF(options_->get<int>("dec-cell-high-depth") > 1,
"--type nematus does not currently support "
"--dec-cell-high-depth > 1, use --type s2s");
}
void load(Ptr<ExpressionGraph> graph,

View File

@ -1,42 +0,0 @@
cuda_add_library(pymarian SHARED
mariannmt.cpp
../3rd_party/cnpy/cnpy.cpp
../3rd_party/exception.cpp
../3rd_party/svd/svd.cpp
../graph/expression_graph.cpp
../graph/expression_operators.cu
../graph/node.cu
../graph/node_operators.cu
../tensors/tensor.cu
../tensors/device.cpp
../kernels/tensor_operators.cu
../tensors/gpu/dropout.cu
../tensors/cpu/dropout.cpp
../kernels/sparse.cu
#../layers/param_initializers.cu
../rnn/attention.cu
../rnn/cells.cu
#../optimizers/clippers.cu
#../optimizers/optimizers.cu
../common/utils.cpp
../common/logging.cpp
../common/config.cpp
../common/config_parser.cpp
../translator/history.cpp
../translator/output_collector.cpp
../translator/nth_element.cu
../translator/helpers.cu
../data/vocab.cpp
../data/corpus.cpp
../data/text_input.cpp
#../rescorer/score_collector.cpp
$<TARGET_OBJECTS:libyaml-cpp>
)
set_target_properties(pymarian PROPERTIES EXCLUDE_FROM_ALL 1)
set_target_properties(pymarian PROPERTIES OUTPUT_NAME mariannmt)
set_target_properties(pymarian PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
set_target_properties(pymarian PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
target_link_libraries(pymarian ${EXT_LIBS} marian)
cuda_add_cublas_to_target(pymarian)

View File

@ -1,44 +0,0 @@
#include <cstdlib>
#include <iostream>
#include <string>
#include <boost/python.hpp>
#include "common/utils.h"
#include "common/version.h"
#include "translator/beam_search.h"
#include "translator/translator.h"
using namespace marian;
Ptr<TranslateServiceMultiGPU<BeamSearch>> task;
void init(const std::string& argopts) {
auto options = New<Config>(argopts, ConfigMode::translating);
task = New<TranslateServiceMultiGPU<BeamSearch>>(options);
LOG(info, "Translator initialized");
}
boost::python::list translate(boost::python::list& pyinput) {
std::vector<std::string> input;
for(int i = 0; i < boost::python::len(pyinput); ++i) {
input.emplace_back(
boost::python::extract<std::string>(boost::python::object(pyinput[i])));
}
auto output = task->run(input);
boost::python::list pyoutput;
pyoutput.append(Join(output, "\n"));
return pyoutput;
}
std::string version() {
return PROJECT_VERSION;
}
BOOST_PYTHON_MODULE(libmariannmt) {
boost::python::def("init", init);
boost::python::def("translate", translate);
boost::python::def("version", version);
}

View File

@ -16,9 +16,27 @@
namespace marian {
class AllocationException : public std::exception {
private:
char* message_;
public:
virtual const char* what() const throw() {
return "Memory re-allocation attempted";
AllocationException(size_t available, size_t asked) {
std::string mstr = "Attempted allocation of "
+ std::to_string(asked)
+ ", but only "
+ std::to_string(available)
+ " free";
message_ = new char[mstr.size() + 1];
std::copy(mstr.begin(), mstr.end(), message_);
}
~AllocationException() {
delete[] message_;
}
virtual const char* what() const noexcept {
return message_;
}
};
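
The exception now reports both the requested and the available size. The message is held in a manually managed `char` array; one way to avoid the manual `new[]`/`delete[]` (and the sharing that a copied exception would otherwise cause) is a `std::string` member. A sketch of that variant, shown only for illustration and not part of this commit:

```cpp
#include <cstddef>
#include <exception>
#include <string>

// Alternative sketch: std::string owns the message, so copies of the
// exception stay safe and no user-defined destructor is needed.
class AllocationExceptionAlt : public std::exception {
  std::string message_;

public:
  AllocationExceptionAlt(size_t available, size_t asked)
      : message_("Attempted allocation of " + std::to_string(asked)
                 + ", but only " + std::to_string(available) + " free") {}

  const char* what() const noexcept override { return message_.c_str(); }
};
```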
@ -111,7 +129,7 @@ private:
auto it = std::lower_bound(gaps_.begin(), gaps_.end(), Gap(nullptr, size));
if(throw_ && it == gaps_.end()) {
throw AllocationException();
throw AllocationException(available_, size);
}
while(it == gaps_.end()) {
@ -119,8 +137,11 @@ private:
it = std::lower_bound(gaps_.begin(), gaps_.end(), Gap(nullptr, size));
}
available_ -= it->size();
return *it;
Gap gap = *it;
gaps_.erase(it);
available_ -= gap.size();
return gap;
}
void insertGap(Gap gap, bool consolidate = true) {
@ -186,7 +207,6 @@ public:
bytes = align(bytes);
Gap gap = getGap(bytes);
gaps_.erase(gap);
if(gap.size() > bytes) {
insertGap(gap.rest(bytes), false);
}

View File

@ -23,9 +23,9 @@ template <size_t I = 0> struct E {
functional::Array<functional::Tensor<float>, K>& tensors,
functional::Array<int, K> indices) {
auto& shape = tensors[0].shape();
const auto& shape = tensors[0].shape();
// loop for outer-most dimension
// loop over outer-most dimension
for(int i = 0; i < shape[I]; ++i) {
// call loop for next-inner dimension
@ -66,7 +66,7 @@ void Element(const Functor& functor, marian::Tensor out, Tensors... tensors) {
// call elementwise operation going from outer-most dimension
// to inner-most element.
E<>::element(functor, gTensors, indices);
E<0>::element(functor, gTensors, indices);
}
}

View File

@ -95,8 +95,9 @@ void Prod(marian::Tensor C,
}
void ProdBatched(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
Ptr<Allocator> allocator,
const marian::Tensor A,
const marian::Tensor B,
bool transA,
bool transB,
float beta,
@ -128,30 +129,21 @@ void ProdBatched(marian::Tensor C,
auto strideA = batchA == 1 ? 0 : m * k;
auto strideC = n * m;
int steps = std::max(batchA, batchB);
int offsetA = 0;
int offsetB = 0;
int offsetC = 0;
for(int i = 0; i < steps; ++i) {
int batchC = std::max(batchA, batchB);
for(int i = 0; i < batchC; ++i) {
sgemm(transA,
transB,
m,
n,
k,
alpha,
A->data() + offsetA,
A->data() + (i % batchA) * strideA,
lda,
B->data() + offsetB,
B->data() + (i % batchB) * strideB,
ldb,
beta,
C->data() + offsetC,
C->data() + i * strideC,
ldc);
offsetA += strideA;
offsetB += strideB;
offsetC += strideC;
}
#else
ABORT("Not implemented!");

View File

@ -50,12 +50,13 @@ inline void gInsertCols(float* out,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
size_t offset_in,
float beta) {
for(int j = 0; j < rows; ++j) {
float* rowOut = out + j * cols_out + offset_out;
const float* rowIn = in + j * cols_in + offset_in;
for(int i = 0; i < cols; ++i) {
rowOut[i] = rowIn[i];
rowOut[i] = rowIn[i] + beta * rowOut[i];
}
}
}
@ -71,7 +72,7 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
"First dimension must be equal");
int cols_in = in->shape().back();
cpu::gInsertCols(
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0, 0);
offset += cols_in;
}
}
@ -91,8 +92,11 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
ABORT_IF(rows != out->shape().elements() / out->shape().back(),
"First dimension must be equal");
int cols_out = out->shape().back();
// set last parameter to 1 to enable += instead of =
// @TODO: do this in a more principled way across all/most kernels
cpu::gInsertCols(
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset, 1);
offset += cols_out;
}
}
@ -108,9 +112,17 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
size_t size = out->shape().elements() / step;
size_t offset2 = i * size;
std::copy(in->data() + offset1,
in->data() + offset1 + size,
out->data() + offset2);
// BUG: This overwrites gradients!
//std::copy(in->data() + offset1,
// in->data() + offset1 + size,
// out->data() + offset2);
// Fixes gradient problem, @TODO: check performance
std::transform(in->data() + offset1,
in->data() + offset1 + size,
out->data() + offset2,
out->data() + offset2,
[](float a, float b){ return a + b; });
offset1 += size;
}
@ -124,6 +136,7 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
SplitCont(outputs, in, ax);
}
template <bool add>
void Transpose0213(Tensor out, Tensor in) {
int cols = in->shape()[-1];
int rows = in->shape().elements() / in->shape()[-1];
@ -141,7 +154,15 @@ void Transpose0213(Tensor out, Tensor in) {
const float* inRow = in->data() + src * cols ;
float* outRow = out->data() + dst * cols;
std::copy(inRow, inRow + cols, outRow);
if(!add) {
// mostly for fast forward computation
std::copy(inRow, inRow + cols, outRow);
}
else {
for(int i = 0; i < cols; ++i) {
outRow[i] += inRow[i];
}
}
}
}
}
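
The `add` template flag above (and the matching `ShiftGrad`/`TransposeNDGrad` entry points) makes the backward variants accumulate into the destination instead of overwriting it. This matters because a node's gradient buffer can receive contributions from several consumers, which is exactly the "first operator after a branch" case mentioned in the changelog. A standalone illustration:

```cpp
// Why backward kernels accumulate: if a value feeds two consumers, its
// gradient is the sum of both contributions; an overwriting kernel would
// silently keep only the last one.
#include <cassert>

int main() {
  float grad_x = 0.f;       // gradient buffer for a shared input x
  float adj_from_a = 1.5f;  // contribution propagated through consumer A
  float adj_from_b = 2.0f;  // contribution propagated through consumer B
  grad_x += adj_from_a;     // accumulate, do not assign
  grad_x += adj_from_b;
  assert(grad_x == 3.5f);
}
```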
@ -186,6 +207,7 @@ void Transpose10(Tensor out, const Tensor in) {
}
// @TODO: optimize this, currently it's quite horrible
template <bool add>
void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
functional::Array<int, functional::Shape::size()> permute;
int diff = functional::Shape::size() - vAxis.size();
@ -207,19 +229,29 @@ void TransposeGeneric(Tensor out, Tensor in, const std::vector<int>& vAxis) {
gOut.shape().dims(index, oDims);
for(int i = 0; i < N; ++i)
pDims[permute[i]] = oDims[i];
gOut[index] = gIn[pDims];
if(add)
gOut[index] += gIn[pDims];
else
gOut[index] = gIn[pDims];
}
}
void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis) {
if(vAxis == std::vector<int>({0, 2, 1, 3}))
Transpose0213(out, in);
else if(vAxis == std::vector<int>({1, 0})
&& in->shape()[-1] % 16 == 0
Transpose0213<false>(out, in);
else if(vAxis == std::vector<int>({1, 0})
&& in->shape()[-1] % 16 == 0
&& in->shape()[-2] % 16 == 0)
Transpose10(out, in);
else
TransposeGeneric(out, in, vAxis);
TransposeGeneric<false>(out, in, vAxis);
}
void TransposeNDGrad(Tensor out, Tensor in, const std::vector<int>& vAxis) {
if(vAxis == std::vector<int>({0, 2, 1, 3}))
Transpose0213<true>(out, in);
else
TransposeGeneric<true>(out, in, vAxis);
}
void Softmax(Tensor out_, Tensor in_, Tensor mask_) {
@ -412,9 +444,8 @@ void PasteCols(Tensor out_,
const float* rowIn = in + j * colsIn;
float* rowOut = out + j * colsOut;
// @TODO: should this be a sum?
for(int i = 0; i < colsIn; ++i) {
rowOut[indices[i]] = rowIn[i];
rowOut[indices[i]] += rowIn[i];
}
}
}
@ -458,7 +489,6 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
#pragma omp simd
for(int i = 0; i < cols; ++i) {
// @TODO: stable sigmoid
float r = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
@ -901,6 +931,26 @@ void Shift(Tensor out_, Tensor in_, marian::Shape shift, float padValue, bool in
}
}
void ShiftGrad(Tensor out_, Tensor in_, marian::Shape shift, bool invert) {
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
offset += in_->shape().stride(i) * shift[i];
if(invert)
offset = -offset;
float* out = out_->data();
const float* in = in_->data();
int length = out_->shape().elements();
#pragma omp parallel for
for(int i = 0; i < length; ++i) {
if(i - offset >= 0 && i - offset < length) {
out[i] += in[i - offset];
}
}
}
void SetSparse(float* out,
const std::vector<size_t>& indices,
const std::vector<float>& values) {

View File

@ -160,7 +160,6 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
bool broadcast = false;
for(int i = 0; i < K; ++i)
broadcast = broadcast || gOut.shape() != gIns[i].shape();
gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast);
} else {
int threads = std::min(MAX_THREADS, length);

View File

@ -41,3 +41,4 @@ template void Element<Assign<Var<1>, BinaryFunctor<elem::Clip, Assignee<2>, Capt
template void Element<Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Div, Assignee<1>, Capture>>>(Assign<Var<1>, BinaryFunctor<elem::Div, Assignee<1>, Capture>>, marian::Tensor);

View File

@ -67,6 +67,30 @@ void Prod(marian::Tensor C,
#endif
}
__global__ void gAddBias(float* out, const float* bias, size_t length, size_t cols) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
size_t index2 = index % cols;
out[index] += bias[index2];
}
}
}
void AddBias(marian::Tensor C, const marian::Tensor bias) {
cudaSetDevice(C->getDevice().no);
int length = C->shape().elements();
int cols = bias->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gAddBias<<<blocks, threads>>>(C->data(), bias->data(), length, cols);
cudaStreamSynchronize(0);
}
void ProdWithBias(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
@ -76,13 +100,13 @@ void ProdWithBias(marian::Tensor C,
float beta,
float scalar) {
marian::gpu::Prod(C, A, B, transA, transB, beta, scalar);
marian::gpu::Add(functional::_1, 1.f, C, bias);
marian::gpu::AddBias(C, bias);
}
void ProdBatched(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
Ptr<Allocator> allocator,
const marian::Tensor A,
const marian::Tensor B,
bool transA,
bool transB,
float beta,
@ -116,30 +140,57 @@ void ProdBatched(marian::Tensor C,
auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())
->getCublasHandle();
int strideA = batchA == 1 ? 0 : m * k;
int strideB = batchB == 1 ? 0 : n * k;
int strideC = n * m;
int batchC = std::max(batchA, batchB);
std::vector<const float*> aptr;
std::vector<const float*> bptr;
std::vector<float*> cptr;
for(int i = 0; i < batchC; i++) {
aptr.push_back(A->data() + (i % batchA) * strideA);
bptr.push_back(B->data() + (i % batchB) * strideB);
cptr.push_back(C->data() + i * strideC);
}
auto mp_aptr = allocator->alloc<const float*>(aptr.size());
CudaCopy(aptr.data(), aptr.data() + aptr.size(), mp_aptr->data<const float*>());
auto mp_bptr = allocator->alloc<const float*>(bptr.size());
CudaCopy(bptr.data(), bptr.data() + bptr.size(), mp_bptr->data<const float*>());
auto mp_cptr = allocator->alloc<float*>(cptr.size());
CudaCopy(cptr.data(), cptr.data() + cptr.size(), mp_cptr->data<float*>());
#if CUDA_VERSION >= 9000
cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
#endif
cublasSgemmStridedBatched(cublasHandle,
opB,
opA,
n,
m,
k,
&alpha,
B->data(),
ldb,
batchB == 1 ? 0 : n * k,
A->data(),
lda,
batchA == 1 ? 0 : m * k,
&beta,
C->data(),
ldc,
n * m,
std::max(batchA, batchB));
cublasSgemmBatched(cublasHandle,
opB,
opA,
n,
m,
k,
&alpha,
mp_bptr->data<const float*>(),
ldb,
mp_aptr->data<const float*>(),
lda,
&beta,
mp_cptr->data<float*>(),
ldc,
batchC);
#if CUDA_VERSION >= 9000
cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
#endif
}
allocator->free(mp_aptr);
allocator->free(mp_bptr);
allocator->free(mp_cptr);
}
}
}
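
The rewritten GPU `ProdBatched` builds per-matrix pointer arrays on the host, copies them into temporary buffers taken from the allocator, and calls `cublasSgemmBatched`; the `i % batchA` / `i % batchB` indexing lets a batch of size 1 broadcast against a larger one. A standalone sketch of just the offset computation (no CUDA required; offsets stand in for the device pointers):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Example: A has batch 1 (broadcast), B has batch 3.
  int batchA = 1, batchB = 3;
  int m = 2, k = 4, n = 5;
  int strideA = batchA == 1 ? 0 : m * k;  // 0 => every GEMM reuses the same A
  int strideB = batchB == 1 ? 0 : n * k;
  int strideC = n * m;
  int batchC = std::max(batchA, batchB);
  std::vector<int> offA, offB, offC;      // element offsets instead of pointers
  for(int i = 0; i < batchC; ++i) {
    offA.push_back((i % batchA) * strideA);
    offB.push_back((i % batchB) * strideB);
    offC.push_back(i * strideC);
  }
  for(int i = 0; i < batchC; ++i)
    std::printf("gemm %d: A+%d, B+%d, C+%d\n", i, offA[i], offB[i], offC[i]);
}
```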

View File

@ -26,8 +26,9 @@ void ProdWithBias(marian::Tensor C,
float scalar = 1);
void ProdBatched(marian::Tensor C,
const marian::Tensor& A,
const marian::Tensor& B,
Ptr<Allocator> allocator,
const marian::Tensor A,
const marian::Tensor B,
bool transA,
bool transB,
float beta = 0,

View File

@ -38,6 +38,8 @@ bool IsNan(Tensor in) {
}
void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
cudaSetDevice(out->getDevice().no);
int step = 1;
for(int i = 0; i < axis; ++i)
@ -49,7 +51,7 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
size_t size = in->shape().elements() / step;
size_t offset2 = i * size;
cudaMemcpyAsync(out->data() + offset1,
cudaMemcpy(out->data() + offset1,
in->data() + offset2,
size * sizeof(float),
cudaMemcpyDeviceToDevice);
@ -60,14 +62,15 @@ void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
cudaStreamSynchronize(0);
}
template <bool add>
__global__ void gInsertCols(float* out,
const float* in,
size_t rows,
size_t cols,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
const float* in,
size_t rows,
size_t cols,
size_t cols_out,
size_t cols_in,
size_t offset_out,
size_t offset_in) {
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
@ -77,7 +80,10 @@ __global__ void gInsertCols(float* out,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols)
rowOut[i] = rowIn[i];
if(add)
rowOut[i] += rowIn[i];
else
rowOut[i] = rowIn[i];
}
}
}
@ -99,16 +105,81 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols_in);
gInsertCols<<<blocks, threads>>>(
gInsertCols<false><<<blocks, threads>>>(
out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
offset += cols_in;
}
cudaStreamSynchronize(0);
}
__global__ void gJoin2(float* out, size_t rowBatch, size_t cols,
const float* in1, size_t inStride1,
const float* in2, size_t inStride2) {
int outStride = inStride1 + inStride2;
int rows = rowBatch * outStride;
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;
int curBatch = j / outStride;
int curPos = j % outStride;
int jIn1 = (curBatch * inStride1) + curPos;
int jIn2 = (curBatch * inStride2) + curPos - inStride1;
const float* rowIn1 = in1 + jIn1 * cols;
const float* rowIn2 = in2 + jIn2 * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
if(curPos < inStride1)
rowOut[i] = rowIn1[i];
else
rowOut[i] = rowIn2[i];
}
}
}
}
}
void Concatenate2(Tensor out, Tensor in1, Tensor in2) {
cudaSetDevice(out->getDevice().no);
size_t rows = out->shape().elements() / out->shape().back();
size_t cols = out->shape().back();
size_t rowStride1 = in1->shape()[-2];
size_t rowStride2 = in2->shape()[-2];
size_t rowBatch = rows / out->shape()[-2];
int blocks = std::min(MAX_BLOCKS, (int)rows);
int threads = std::min(MAX_THREADS, (int)cols);
gJoin2<<<blocks, threads>>>(out->data(),
rowBatch,
cols,
in1->data(),
rowStride1,
in2->data(),
rowStride2);
cudaStreamSynchronize(0);
}
void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax) {
if(ax == out->shape().size() - 1)
Concatenate1(out, inputs);
else if(ax == out->shape().size() - 2 && inputs.size() == 2)
Concatenate2(out, inputs[0], inputs[1]);
else
ConcatCont(out, inputs, ax);
}
@ -127,13 +198,24 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols_out);
gInsertCols<<<blocks, threads>>>(
gInsertCols<true><<<blocks, threads>>>(
out->data(), in->data(), rows, cols_out, cols_out, cols_in, 0, offset);
offset += cols_out;
}
cudaStreamSynchronize(0);
}
// @TODO: this function is just a temporary fix until I come up with
// something better for the situation below.
__global__ void gAddRow(float* out, const float* in, int length) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
out[index] = in[index] + out[index];
}
}
}
void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
cudaSetDevice(in->getDevice().no);
@ -141,17 +223,25 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
for(int i = 0; i < axis; ++i)
step *= in->shape()[i];
size_t offset1 = 0;
int offset1 = 0;
for(int i = 0; i < step; ++i) {
for(auto out : outputs) {
size_t size = out->shape().elements() / step;
size_t offset2 = i * size;
int size = out->shape().elements() / step;
int offset2 = i * size;
cudaMemcpyAsync(out->data() + offset2,
in->data() + offset1,
size * sizeof(float),
cudaMemcpyDeviceToDevice);
// BUG: this does not add gradients
//cudaMemcpyAsync(out->data() + offset2,
// in->data() + offset1,
// size * sizeof(float),
// cudaMemcpyDeviceToDevice);
// @TODO: this is a quick but bad fix for the above bug
int threads = std::min(MAX_THREADS, size);
int blocks = std::min(MAX_BLOCKS, size / threads + (size % threads != 0));
gAddRow<<<blocks, threads>>>(out->data() + offset2,
in->data() + offset1,
size);
offset1 += size;
}
}
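The async copy that used to live here overwrote the destination, but Deconcatenate runs in the backward pass, where the incoming gradient must be added to whatever gradient is already stored; gAddRow restores that accumulation. A small illustration with made-up numbers.

// Sketch (assumption): the backward split must accumulate, not overwrite.
// existing gradient in out slice  : [0.1, 0.2]
// incoming gradient slice from in : [1.0, 1.0]
// cudaMemcpyAsync would leave       [1.0, 1.0]  (previous gradient lost)
// gAddRow leaves                    [1.1, 1.2]  (gradients summed correctly)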
@ -165,6 +255,7 @@ void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
SplitCont(outputs, in, ax);
}
template <bool add>
__global__ void gTransposeND(
functional::Tensor<float> out,
const functional::Tensor<float> in,
@ -180,27 +271,114 @@ __global__ void gTransposeND(
out.shape().dims(index, oDims);
for(int i = 0; i < N; ++i)
pDims[permute[i]] = oDims[i];
out[index] = in[pDims];
if(add)
out[index] += in[pDims];
else
out[index] = in[pDims];
}
}
}
template <bool add>
__global__ void gTranspose0213(float* out, const float* in,
int rows,
int cols,
int stride1,
int stride2) {
int stride = stride1 * stride2;
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
float* rowOut = out + j * cols;
int z = j / stride;
int y = (j % stride) / stride1;
int x = (j % stride) % stride1;
int j2 = z * stride + x * stride2 + y;
const float* rowIn = in + j2 * cols;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
if(add)
rowOut[i] += rowIn[i];
else
rowOut[i] = rowIn[i];
}
}
}
}
}
void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis) {
cudaSetDevice(out->getDevice().no);
if(vAxis == std::vector<int>({0, 2, 1, 3})) {
functional::Array<int, functional::Shape::size()> axes;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < axes.size(); ++i)
if(i < diff)
axes[i] = i;
else
axes[i] = vAxis[i - diff] + diff;
int rows = out->shape().elements() / out->shape().back();
int cols = out->shape().back();
int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols);
gTransposeND<<<blocks, threads>>>(out, in, axes);
int stride1 = out->shape()[-2];
int stride2 = out->shape()[-3];
gTranspose0213<false><<<blocks, threads>>>(out->data(), in->data(),
rows, cols, stride1, stride2);
}
else {
functional::Array<int, functional::Shape::size()> axes;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < axes.size(); ++i)
if(i < diff)
axes[i] = i;
else
axes[i] = vAxis[i - diff] + diff;
int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gTransposeND<false><<<blocks, threads>>>(out, in, axes);
}
}
void TransposeNDGrad(Tensor out, Tensor in, const std::vector<int>& vAxis) {
cudaSetDevice(out->getDevice().no);
if(vAxis == std::vector<int>({0, 2, 1, 3})) {
int rows = out->shape().elements() / out->shape().back();
int cols = out->shape().back();
int blocks = std::min(MAX_BLOCKS, rows);
int threads = std::min(MAX_THREADS, cols);
int stride1 = out->shape()[-2];
int stride2 = out->shape()[-3];
gTranspose0213<true><<<blocks, threads>>>(out->data(), in->data(),
rows, cols, stride1, stride2);
}
else {
functional::Array<int, functional::Shape::size()> axes;
int diff = functional::Shape::size() - vAxis.size();
for(int i = 0; i < axes.size(); ++i)
if(i < diff)
axes[i] = i;
else
axes[i] = vAxis[i - diff] + diff;
int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gTransposeND<true><<<blocks, threads>>>(out, in, axes);
}
}
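gTranspose0213 avoids the general N-dimensional index arithmetic for the frequent {0, 2, 1, 3} permutation by remapping whole rows. A worked example of the row-index mapping, with an assumed output shape of {2, 3, 4, 8} (so the input has shape {2, 4, 3, 8}); the numbers are illustrative only.

// Sketch (assumption): row mapping of gTranspose0213 for permutation {0,2,1,3}.
// out shape {2, 3, 4, 8}: cols = 8, stride1 = 4, stride2 = 3, rows = 24.
int stride1 = 4, stride2 = 3;
int stride = stride1 * stride2;         // 12 rows per outermost slice
int j = 6;                              // output row with coords (z=0, y=1, x=2)
int z = j / stride;                     // 0
int y = (j % stride) / stride1;         // 1
int x = (j % stride) % stride1;         // 2
int j2 = z * stride + x * stride2 + y;  // 7 -> input row with coords (0, 2, 1)
// i.e. out[0][1][2][:] is copied from in[0][2][1][:], swapping axes 1 and 2.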
__global__ void gSoftmax(float* out,
@ -697,7 +875,7 @@ __global__ void gPasteCols(float* out,
for(int tid = 0; tid < colsIn; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < colsIn)
rowOut[targetColIdx[i]] = rowIn[i];
rowOut[targetColIdx[i]] += rowIn[i];
}
}
}
@ -764,7 +942,7 @@ __global__ void gInsert(float* out,
inShape.dims(index, dims);
dims[axis] = d_indices[dims[index]];
int outIndex = outShape.index(dims);
out[outIndex] = in[index];
out[outIndex] += in[index];
}
}
}
@ -1558,14 +1736,21 @@ void LayerNormalizationGrad(Tensor gradX,
eps);
}
template <bool add>
__global__ void gShift(float* out, const float* in, int length, int offset, float padValue) {
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
if(index - offset < 0 || index - offset >= length)
out[index] = padValue;
else
out[index] = in[index - offset];
if(add) {
if(index - offset >= 0 && index - offset < length)
out[index] += in[index - offset];
}
else {
if(index - offset < 0 || index - offset >= length)
out[index] = padValue;
else
out[index] = in[index - offset];
}
}
}
}
@ -1588,7 +1773,28 @@ void Shift(Tensor out, Tensor in, marian::Shape shift, float padValue, bool inve
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gShift<<<blocks, threads>>>(out->data(), in->data(), length, offset, padValue);
gShift<false><<<blocks, threads>>>(out->data(), in->data(), length, offset, padValue);
}
void ShiftGrad(Tensor out, Tensor in, marian::Shape shift, bool invert) {
ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");
// BUGBUG: This can only shift along the first axis. Shifting, e.g., along the last axis cannot be implemented this way.
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
offset += in->shape().stride(i) * shift[i];
if(invert)
offset = -offset;
cudaSetDevice(out->getDevice().no);
int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gShift<true><<<blocks, threads>>>(out->data(), in->data(), length, offset, 0.f);
}
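Shift and ShiftGrad collapse the per-axis shift into a single flat element offset by summing shape strides; the add=true instantiation of gShift then accumulates gradients only at positions that were actually copied in the forward pass, so padded positions receive no gradient. A minimal sketch of the offset computation, assuming a 3-D tensor of shape {10, 4, 512} shifted by one step along the first axis.

// Sketch (assumption): flat offset for shift = {1, 0, 0} on shape {10, 4, 512}.
int offset = 0;
int strides[] = {4 * 512, 512, 1};    // row-major strides of {10, 4, 512}
int shiftBy[] = {1, 0, 0};
for(int i = 0; i < 3; ++i)
  offset += strides[i] * shiftBy[i];  // offset = 2048
// Forward : out[i] = (0 <= i - offset < length) ? in[i - offset] : padValue
// Backward: out[i] += in[i - offset] only where i - offset is in range.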
__global__ void gSetSparse(float* out,

View File

@ -64,7 +64,8 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
// clang-format off
DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
DISPATCH8(ProdWithBias, marian::Tensor, const marian::Tensor&, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
DISPATCH2(Dropout, marian::Tensor, float)
@ -78,7 +79,10 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
DISPATCH4(CrossEntropyPickBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
DISPATCH3(TransposeND, marian::Tensor, marian::Tensor, const std::vector<int>&)
DISPATCH3(TransposeNDGrad, marian::Tensor, marian::Tensor, const std::vector<int>&)
DISPATCH5(Shift, marian::Tensor, marian::Tensor, marian::Shape, float, bool)
DISPATCH4(ShiftGrad, marian::Tensor, marian::Tensor, marian::Shape, bool)
DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
// clang-format on

View File

@ -0,0 +1,13 @@
#include "training/communicator.h"
namespace marian {
// This fallback is compiled when CUDA is not available.
// The version with CUDA and/or NCCL is compiled in communicator.cu.
#ifndef CUDA_FOUND
Ptr<Communicator> createCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs, bool noNccl) {
return New<DefaultCommunicator>(graphs);
}
#endif
}

View File

@ -0,0 +1,239 @@
#include "training/communicator.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
#ifdef USE_NCCL
#include "cuda_runtime.h"
#include "nccl.h"
#endif
namespace marian {
#ifdef USE_NCCL
class NCCLCommunicator : public Communicator {
private:
std::vector<ncclComm_t> comms_;
std::vector<cudaStream_t> streams_;
std::vector<int> devices_;
void synchronizeAll() {
for(int i = 0; i < graphs_.size(); ++i) {
cudaSetDevice(devices_[i]);
cudaStreamSynchronize(streams_[i]);
}
}
public:
NCCLCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs)
: Communicator(graphs),
comms_(graphs.size()),
streams_(graphs.size()),
devices_(graphs.size())
{
LOG(info, "[comm] Using NCCL library for GPU communication");
for(int i = 0; i < graphs_.size(); ++i) {
auto device = graphs_[i]->getBackend()->getDevice();
ABORT_IF(device.type != DeviceType::gpu,
"NCCL communicator can only be used with GPUs");
devices_[i] = device.no;
cudaSetDevice(devices_[i]);
cudaStreamCreate(&streams_[i]);
}
ncclCommInitAll(comms_.data(), devices_.size(), devices_.data());
}
~NCCLCommunicator() override {
for(int i = 0; i < devices_.size(); ++i) {
cudaSetDevice(devices_[i]);
cudaStreamDestroy(streams_[i]);
ncclCommDestroy(comms_[i]);
}
}
void scatterReduce() override {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());
int pos = 0;
ncclGroupStart();
for(int i = 0; i < graphs_.size(); ++i) {
int size = std::min(shardSize, totalSize);
const void* sendbuff = (const void*)graphs_[i]->params()->grads()->data();
auto subgrad = graphs_[i]->params()->grads()->subtensor(pos, size);
void* recvbuff = subgrad->data();
ncclReduceScatter(sendbuff,
recvbuff,
shardSize,
ncclFloat,
ncclSum,
comms_[i],
streams_[i]);
pos += size;
totalSize -= size;
}
ncclGroupEnd();
synchronizeAll();
}
void allGather() override {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());
int pos = 0;
ncclGroupStart();
for(int i = 0; i < graphs_.size(); ++i) {
int size = std::min(shardSize, totalSize);
auto subparam = graphs_[i]->params()->vals()->subtensor(pos, size);
const void* sendbuff = (const void*)subparam->data();
void* recvbuff = (void*)graphs_[i]->params()->vals()->data();
ncclAllGather(sendbuff,
recvbuff,
shardSize,
ncclFloat,
comms_[i],
streams_[i]);
pos += size;
totalSize -= size;
}
ncclGroupEnd();
synchronizeAll();
}
void swapParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard
ABORT_IF(graphs_.size() < 2, "Swap requires at least two graphs");
auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph, apart from last graph
for(int i = 0; i < graphs_.size() - 1; ++i) {
auto subParam = graphs_[i]->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}
// back-up shard from last graph
auto subParamLast = graphs_.back()->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParamLast);
auto subParamFirst = graphs_[0]->params()->vals()->subtensor(pos, params[idx]->size());
subParamLast->copyFrom(subParamFirst);
};
// execute for each shard
this->foreach(gather);
}
void pushParams(std::vector<Tensor>& params) override {
// Copy parameter shard from i-th graph to shard params[i].
// Graphs and shards with the same index live on the same device.
auto copy = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
auto subParam = graphs_[idx]->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParam);
};
this->foreach(copy);
}
void pullParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard
auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
for(auto graph : graphs_) {
auto subParam = graph->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}
};
this->foreach(gather);
}
// Doesn't work yet with NCCL
// void pushParams(std::vector<Tensor>& params) {
// // Copy parameter shard from i-th graph to shard params[i].
// // Graphs and shards with the same index live on the same device.
// int pos = 0;
// for(int i = 0; i < graphs_.size(); ++i) {
// auto subParam = graphs_[i]->params()->vals()->subtensor(pos, params[i]->size());
// ncclGroupStart();
// ncclBroadcast((const void*)subParam->data(),
// (void*)params[i]->data(),
// params[i]->size(),
// ncclFloat,
// 0,
// comms_[i],
// streams_[i]);
// ncclGroupEnd();
// pos += params[i]->size();
// }
// synchronizeAll();
// }
// void pullParams(const std::vector<Tensor>& params) {
// // Update all graphs with parameter shard
// int totalSize = graphs_[0]->params()->vals()->size();
// int shardSize = ceil(totalSize / (float)graphs_.size());
// ncclGroupStart();
// for(int i = 0; i < graphs_.size(); ++i) {
// const void* sendbuff = (const void*)params[i]->data();
// void* recvbuff = (void*)graphs_[i]->params()->vals()->data();
// ncclAllGather(sendbuff,
// recvbuff,
// shardSize,
// ncclFloat,
// comms_[i],
// streams_[i]);
// }
// ncclGroupEnd();
// synchronizeAll();
// }
};
#endif
Ptr<Communicator> createCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs, bool noNccl) {
#ifdef USE_NCCL
if(noNccl) {
LOG(warn, "[comm] NCCL communicator overridden");
return New<DefaultCommunicator>(graphs);
}
// if at least one of the devices is not a GPU, fall back to the default communicator
for(auto& graph : graphs) {
if(graph->getBackend()->getDevice().type == DeviceType::cpu) {
return New<DefaultCommunicator>(graphs);
}
}
size_t d = graphs.size();
if((d & (d - 1)) != 0) {
LOG(warn, "[comm] Number of devices {} is not a power of 2 and communication might be slow with NCCL", d);
LOG(warn, "[comm] You can switch off NCCL with --no-nccl option", d);
}
return New<NCCLCommunicator>(graphs);
#else
return New<DefaultCommunicator>(graphs);
#endif
}
}

178
src/training/communicator.h Normal file
View File

@ -0,0 +1,178 @@
#include "graph/expression_graph.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
namespace marian {
class Communicator {
protected:
const std::vector<Ptr<ExpressionGraph>> graphs_;
public:
Communicator(const std::vector<Ptr<ExpressionGraph>>& graphs)
: graphs_(graphs) {}
virtual ~Communicator() {}
virtual void foreach(const std::function<void(size_t, int)>& func) {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());
int pos = 0;
std::vector<std::thread> group;
// iterate over all shards
for(int idx = 0; idx < graphs_.size(); ++idx) {
int size = std::min(shardSize, totalSize);
group.emplace_back(func, idx, pos);
pos += size;
totalSize -= size;
}
for(auto& t : group)
t.join();
}
virtual void scatterReduce() = 0;
virtual void allGather() = 0;
virtual void pushParams(std::vector<Tensor>& params) = 0;
virtual void pullParams(const std::vector<Tensor>& params) = 0;
virtual void swapParams(const std::vector<Tensor>& params) = 0;
};
class DefaultCommunicator : public Communicator {
private:
std::vector<Ptr<TensorAllocator>> paramsAllocs_;
std::vector<Tensor> tmpTensors_;
void init() {
if(tmpTensors_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());
int pos = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize, totalSize);
auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);
paramsAlloc->reserveExact(__size__ * sizeof(float));
Tensor tmp;
paramsAlloc->allocate(tmp, {1, __size__});
tmpTensors_.push_back(tmp);
// move to next shard
pos += __size__;
totalSize -= __size__;
}
}
}
public:
DefaultCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs)
: Communicator(graphs) {}
~DefaultCommunicator() override {}
void scatterReduce() override {
init();
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());
// Gather gradients from different devices into current gradient shards
auto scatter = [this, shardSize](size_t idx, int pos) {
auto curGrad = graphs_[idx]->params()->grads()->subtensor(pos, shardSize);
// collect and sum gradients
// to be replaced with ncclReduceScatter
for(auto graph : graphs_) {
if(graph != graphs_[idx]) {
auto subGrad = graph->params()->grads()->subtensor(pos, shardSize);
tmpTensors_[idx]->copyFrom(subGrad);
using namespace functional;
Element(_1 = _1 + _2, curGrad, tmpTensors_[idx]);
}
}
};
this->foreach(scatter);
}
void allGather() override {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)graphs_.size());
// Update all graphs with parameter shard
auto gather = [this, shardSize](size_t idx, int pos) {
auto curParam = graphs_[idx]->params()->vals()->subtensor(pos, shardSize);
// copy parameter shard to each graph
for(auto graph : graphs_) {
if(graph != graphs_[idx]) {
auto subParam = graph->params()->vals()->subtensor(pos, shardSize);
subParam->copyFrom(curParam);
}
}
};
this->foreach(gather);
}
void pushParams(std::vector<Tensor>& params) override {
// Copy parameter shard from i-th graph to shard params[i].
// Graphs and shards with the same index live on the same device.
auto copy = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
auto subParam = graphs_[idx]->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParam);
};
this->foreach(copy);
}
void pullParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard
auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph
for(auto graph : graphs_) {
auto subParam = graph->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}
};
this->foreach(gather);
}
void swapParams(const std::vector<Tensor>& params) override {
// Update all graphs with parameter shard
ABORT_IF(graphs_.size() < 2, "Swap requires at least two graphs");
auto gather = [this, params](size_t idx, int pos) {
// copy parameter shard to each graph, apart from last graph
for(int i = 0; i < graphs_.size() - 1; ++i) {
auto subParam = graphs_[i]->params()->vals()->subtensor(pos, params[idx]->size());
subParam->copyFrom(params[idx]);
}
// back-up shard from last graph
auto subParamLast = graphs_.back()->params()->vals()->subtensor(pos, params[idx]->size());
params[idx]->copyFrom(subParamLast);
auto subParamFirst = graphs_[0]->params()->vals()->subtensor(pos, params[idx]->size());
subParamLast->copyFrom(subParamFirst);
};
// execute for each shard
this->foreach(gather);
}
};
Ptr<Communicator> createCommunicator(const std::vector<Ptr<ExpressionGraph>>& graphs, bool noNccl = false);
}
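The communicator's view of training is: slice the flat parameter and gradient vectors into one shard per device, reduce gradients into the shards, update each shard locally, then gather the updated shards back. A minimal sketch of that call pattern, assuming members graphs_ and shardOpt_ (one optimizer per shard) as used by the synchronous graph group; this is not the exact training loop.

// Sketch (assumption): typical use of a Communicator for a synchronous update.
auto comm = createCommunicator(graphs_, /*noNccl=*/false);

comm->scatterReduce();                    // sum gradients into per-device shards

comm->foreach([&](size_t idx, int pos) {  // update each shard with its optimizer
  int totalSize = graphs_[0]->params()->vals()->size();
  int shardSize = (int)ceil(totalSize / (float)graphs_.size());
  int size = std::min(shardSize, totalSize - pos);
  auto param = graphs_[idx]->params()->vals()->subtensor(pos, size);
  auto grad  = graphs_[idx]->params()->grads()->subtensor(pos, size);
  shardOpt_[idx]->update(param, grad);    // assumed per-shard optimizer
});

comm->allGather();                        // broadcast updated shards to all devices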

View File

@ -1,99 +1,139 @@
#include "training/gradient_dropping/gpu/sparse_algorithm.h"
#include "tensors/gpu/algorithm.h"
#include "tensors/gpu/cuda_helpers.h"
#include <curand.h>
#include <curand_kernel.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include "tensors/gpu/algorithm.h"
#include "tensors/gpu/cuda_helpers.h"
namespace marian {
namespace gpu {
struct non_zero
{
__host__ __device__
bool operator()(const float x)
{
return x != 0;
}
};
namespace gpu {
struct non_zero {
__host__ __device__ bool operator()(const float x) { return x != 0; }
};
__global__ void copy_id(float* data,
int* indices,
float* out,
int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= size)
return;
out[idx] = data[indices[idx]];
}
__global__ void copy_id(float* data, int* indices, float* out, int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= size)
return;
out[idx] = data[indices[idx]];
}
__global__ void gScatterAdd(float* denseData,
__global__ void gScatterAdd(float* denseData,
float* sparseData,
int* sparseIndices,
int denseSize,
int sparseSize,
int offset) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset
&& sparseIndices[idx] + offset < denseSize)
denseData[sparseIndices[idx] + offset] += sparseData[idx];
}
std::vector<int> lower_bounds(int* data, std::vector<int> values, int size, DeviceId device) {
cudaSetDevice(device.no);
thrust::device_ptr<int> data_ptr(data);
thrust::device_vector<int> d_values(values);
thrust::device_vector<int> d_output(values.size());
thrust::lower_bound(data_ptr, data_ptr + size,
d_values.begin(), d_values.end(),
d_output.begin());
std::vector<int> output(values.size());
thrust::copy(d_output.begin(), d_output.end(), output.begin());
return output;
}
int buildSparse(Tensor t, float* data, int* indices) {
cudaSetDevice(t->getDevice().no);
using namespace thrust;
device_ptr<float> grad_ptr(t->data());
device_ptr<float> sparse_grad_ptr(data);
device_ptr<int> indices_ptr(indices);
int sparse_size = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(t->size()),
grad_ptr,
indices_ptr,
non_zero()) - indices_ptr;
int threads = 512;
int blocks = 1 + t->size() / threads;
copy_id<<<blocks, threads>>>(t->data(), indices, data, sparse_size);
return sparse_size;
}
void scatterAdd(Tensor t, float* data, int *indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);
int threads = 512;
int blocks = 1 + size / threads;
gScatterAdd<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}
}
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset && sparseIndices[idx] + offset < denseSize)
denseData[sparseIndices[idx] + offset] += sparseData[idx];
}
__global__ void gScatterUpdate(float* denseData,
float* sparseData,
int* sparseIndices,
int denseSize,
int sparseSize,
int offset) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset && sparseIndices[idx] + offset < denseSize)
denseData[sparseIndices[idx] + offset] = sparseData[idx];
}
__global__ void gGather(float* denseData,
float* sparseData,
int* sparseIndices,
int denseSize,
int sparseSize,
int offset) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if(idx >= sparseSize)
return;
if(sparseIndices[idx] >= -offset && sparseIndices[idx] + offset < denseSize)
sparseData[idx] = denseData[sparseIndices[idx] + offset];
}
std::vector<int> lower_bounds(int* data,
std::vector<int> values,
int size,
DeviceId device) {
cudaSetDevice(device.no);
thrust::device_ptr<int> data_ptr(data);
thrust::device_vector<int> d_values(values);
thrust::device_vector<int> d_output(values.size());
thrust::lower_bound(data_ptr,
data_ptr + size,
d_values.begin(),
d_values.end(),
d_output.begin());
std::vector<int> output(values.size());
thrust::copy(d_output.begin(), d_output.end(), output.begin());
return output;
}
int buildSparse(Tensor t, float* data, int* indices) {
cudaSetDevice(t->getDevice().no);
using namespace thrust;
device_ptr<float> grad_ptr(t->data());
device_ptr<float> sparse_grad_ptr(data);
device_ptr<int> indices_ptr(indices);
int sparse_size = copy_if(make_counting_iterator<int>(0),
make_counting_iterator<int>(t->size()),
grad_ptr,
indices_ptr,
non_zero())
- indices_ptr;
int threads = 512;
int blocks = 1 + t->size() / threads;
copy_id<<<blocks, threads>>>(t->data(), indices, data, sparse_size);
return sparse_size;
}
void scatterAdd(Tensor t, float* data, int* indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);
int threads = 512;
int blocks = 1 + size / threads;
gScatterAdd<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}
void scatterUpdate(Tensor t, float* data, int* indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);
int threads = 512;
int blocks = 1 + size / threads;
gScatterUpdate<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}
void gather(Tensor t, float* data, int* indices, int size, int offset) {
cudaSetDevice(t->getDevice().no);
int threads = 512;
int blocks = 1 + size / threads;
gGather<<<blocks, threads>>>(
t->data(), data, indices, t->size(), size, offset);
cudaStreamSynchronize(0);
}
}
}
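buildSparse compacts a dense tensor: thrust::copy_if over a counting iterator writes out the positions of all non-zero elements, and copy_id then gathers the values at those positions. A small worked example, purely illustrative.

// Sketch (assumption): result of buildSparse on a small dense tensor.
// dense values t  = [0.0, 0.5, 0.0, 0.0, -1.2, 0.0, 3.0]
// indices buffer  = [1, 4, 6]          (positions of the non-zero entries)
// data buffer     = [0.5, -1.2, 3.0]   (values gathered by copy_id)
// returned size   = 3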

View File

@ -4,17 +4,24 @@
#include "tensors/backend.h"
#include "tensors/tensor.h"
namespace marian {
namespace gpu {
// output is a vector of size values.size. Output[i] is lower_bound of values[i] in data
std::vector<int> lower_bounds(int* data,
std::vector<int> values,
int size,
DeviceId device);
namespace gpu {
/**
* @brief Output[i] is lower_bound of values[i] in data.
*
* @return A vector of size values.size()
*/
std::vector<int> lower_bounds(int* data,
std::vector<int> values,
int size,
DeviceId device);
int buildSparse(Tensor t, float* data, int* indices);
int buildSparse(Tensor t, float* data, int* indices);
void scatterAdd(Tensor t, float* data, int *indices, int size, int offset);
}
}
void scatterAdd(Tensor t, float* data, int* indices, int size, int offset);
void scatterUpdate(Tensor t, float* data, int* indices, int size, int offset);
void gather(Tensor t, float* data, int* indices, int size, int offset);
}
}
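lower_bounds is a batched binary search over the sorted index array of a sparse tensor; SparseTensorBase::subtensor uses it to find where a dense range [pos, pos + subsize) starts and ends inside that index array. A minimal usage sketch under that assumption; indices, pos, subsize, sparseSize and backend are illustrative names.

// Sketch (assumption): locating the sparse entries whose dense index falls
// into [pos, pos + subsize). The index array must be sorted ascending.
std::vector<int> bounds
    = marian::gpu::lower_bounds(indices,               // device pointer to indices
                                {pos, pos + subsize},  // query values
                                sparseSize,            // number of sparse entries
                                backend->getDevice());
int startOffset = bounds[0];  // first entry with index >= pos
int endOffset   = bounds[1];  // first entry with index >= pos + subsize
int sliceSize   = endOffset - startOffset;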

View File

@ -1,14 +1,15 @@
#pragma once
#include <algorithm>
#include <memory>
#include "common/definitions.h"
#include "tensors/backend.h"
#include "tensors/tensor_operators.h"
#include "tensors/device.h"
#include "tensors/tensor_operators.h"
#ifdef CUDA_FOUND
#include "tensors/gpu/algorithm.h"
#include "training/gradient_dropping/gpu/sparse_algorithm.h"
#endif
@ -20,26 +21,26 @@ class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
int size_;
int capacity_;
Ptr<Backend> backend_;
std::vector<Ptr<Device>> devices;
template<typename T>
template <typename T>
T* newData(int size, Ptr<Backend> backend) {
Ptr<Device> device = DispatchDevice(backend->getDevice());
device->reserve(size * sizeof(T));
devices.push_back(device);
return (T*)device->data();
}
public:
SparseTensorBase(int capacity, Ptr<Backend> backend)
: backend_(backend), capacity_(capacity) {
: backend_(backend), capacity_(capacity) {
data_ = newData<float>(capacity, backend);
indices_ = newData<int>(capacity, backend);
}
SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend)
: backend_(backend) {
SparseTensorBase(float* data, int* indices, int size, Ptr<Backend> backend)
: backend_(backend) {
data_ = data;
indices_ = indices;
size_ = size;
@ -60,6 +61,37 @@ public:
int* indices() { return indices_; }
// copy to cpu vector
void get(std::vector<float>& g, std::vector<int>& i) {
int s = std::min((int)g.size(), size());
if(backend_->getDevice().type == DeviceType::cpu) {
std::copy(data(), data() + s, g.data());
std::copy(indices(), indices() + s, i.data());
}
#ifdef CUDA_FOUND
else {
gpu::copy(backend_, data(), data() + s, g.data());
gpu::copy(backend_, indices(), indices() + s, i.data());
}
#endif
}
// copy from cpu vector
void set(const std::vector<float>& g, const std::vector<int>& i) {
int s = std::min((int)g.size(), capacity());
size_ = s;
if(backend_->getDevice().type == DeviceType::cpu) {
std::copy(g.data(), g.data() + s, data());
std::copy(i.data(), i.data() + s, indices());
}
#ifdef CUDA_FOUND
else {
gpu::copy(backend_, g.data(), g.data() + s, data());
gpu::copy(backend_, i.data(), i.data() + s, indices());
}
#endif
}
void copyFrom(float* ndata, int* nindices, int nsize) {
size_ = nsize;
if(backend_->getDevice().type == DeviceType::cpu) {
@ -77,11 +109,13 @@ public:
copyFrom(t->data(), t->indices(), t->size());
}
void toDense(Tensor t, int offset) {
// Convert this sparse tensor into the dense tensor t
void toDense(Tensor t, int offset = 0) {
t->set(0);
scatterAdd(t, offset);
}
// Convert a tensor into a sparse tensor format
void fromDense(Tensor t) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
@ -94,6 +128,7 @@ public:
#endif
}
// Add t[indices[i]] += data[i]
void scatterAdd(Tensor t, int offset = 0) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
@ -105,6 +140,30 @@ public:
#endif
}
// Set t[indices[i]] = data[i]
void scatterUpdate(Tensor t, int offset = 0) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}
#ifdef CUDA_FOUND
else {
gpu::scatterUpdate(t, data(), indices(), size(), offset);
}
#endif
}
// data[i] = t[indices[i]]
void gather(Tensor t, int offset = 0) {
if(backend_->getDevice().type == DeviceType::cpu) {
ABORT("Gradient Dropping for CPU is not yet supported");
}
#ifdef CUDA_FOUND
else {
gpu::gather(t, data(), indices(), size(), offset);
}
#endif
}
std::shared_ptr<SparseTensorBase> subtensor(int pos, int subsize) {
int startOffset = 0;
int endOffset = 0;
@ -118,8 +177,8 @@ public:
}
#ifdef CUDA_FOUND
else {
std::vector<int> outputs = gpu::lower_bounds(
indices(), values, size(), backend_->getDevice());
std::vector<int> outputs
= gpu::lower_bounds(indices(), values, size(), backend_->getDevice());
startOffset = outputs[0];
endOffset = outputs[1];

View File

@ -32,7 +32,7 @@ public:
virtual ~GraphGroup() {}
virtual void update(Ptr<data::Batch>) = 0;
virtual void update(Ptr<data::Batch> batch) = 0;
virtual void load() = 0;

View File

@ -8,113 +8,74 @@
namespace marian {
Tensor AsyncGraphGroupDrop::newTensor(int size, Ptr<Backend> backend) {
Tensor t;
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(backend);
allocator_->reserveExact(size * sizeof(float));
allocator_->allocate(t, {1, size});
allocators.push_back(allocator_);
return t;
}
void AsyncGraphGroupDrop::fetchParams(Tensor oldParams,
const std::vector<Tensor>& params,
int device_id) {
using namespace functional;
// @TODO read guard on parameters
int pos = 0;
// Full fetch when fetching moving average OR still in warm-up period.
if(&params == &paramsAvg_ || fetchStep_[device_id]++ <= dropping_warmup) {
AsyncGraphGroup::fetchParams(oldParams, params, device_id);
return;
}
std::vector<std::thread> threads;
for(int i = 0; i < devices_.size(); i++) {
int pos = 0;
for(int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[&](int idx, int pos) {
[=](int idx, int pos) {
auto sparseGrad = sparseGrads_[device_id][idx];
auto sparseShard = sparseShards_[device_id][idx];
// individual mutex per-shard
std::lock_guard<std::mutex> guard(shardSync_[idx]);
// normal fetch
if(fetchStep_[device_id] <= dropping_warmup
|| &params == &paramsAvg_) { // Do not use sparse fetch when
// fetching from paramsAvg
oldParams->subtensor(pos, params[idx]->size())
->copyFrom(params[idx]);
paramsLocal_[device_id][idx]->copyFrom(params[idx]);
return;
}
// sparse fetch
// get delta : params latest version - current param (locally)
Element(_1 = _2 - _3,
paramsDelta_[idx],
params[idx],
paramsLocal_[device_id][idx]);
// update current local param
paramsLocal_[device_id][idx]->copyFrom(params[idx]);
// get sparse delta
fetchDropper[device_id][idx]->dropGraph(paramsDelta_[idx],
fetchSparseGradient_[idx],
droping_rate,
dropping_momentum);
// move sparse delta
fetchShardedSparseGradient_[device_id][idx]->copyFrom(
fetchSparseGradient_[idx]);
fetchShardedSparseGradient_[device_id][idx]->scatterAdd(
sparseShard->gather(params[idx]);
sparseGrad->copyFrom(sparseShard);
sparseGrad->scatterUpdate(
oldParams->subtensor(pos, params[idx]->size()));
},
i,
idx,
pos));
pos += shardSize_;
}
#if 0
for(auto&& t : threads)
t.join();
// BUGBUG [compiler]: This fails to compile on VS 2015, for the comparison of the iterator with end()
#else
for (size_t i = 0; i < threads.size(); i++)
threads[i].join();
#endif
fetchStep_[device_id]++;
for(size_t i = 0; i < threads.size(); i++)
threads[i].join();
}
void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
size_t batch_words,
int device_id) {
if(pushStep_[device_id]++ <= dropping_warmup) {
if(pushStep_[device_id]++ < dropping_warmup) {
AsyncGraphGroup::pushGradients(newGrads, batch_words, device_id);
return;
}
// get the sparse gradient
pushDropper_[device_id]->dropGraph(newGrads,
pushSparseGradient_[device_id],
droping_rate,
dropping_momentum);
SparseTensor newSparseGrads = pushSparseGradient_[device_id];
// add instead of copy?
std::vector<std::thread> threads;
int pos = 0;
for(int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[=](int idx, int pos) {
auto dropper = droppers_[device_id][idx];
auto sparseGrad = sparseGrads_[device_id][idx];
auto sparseShard = sparseShards_[device_id][idx];
auto tensor = newGrads->subtensor(pos, grads_[idx]->size());
// individual mutex per-shard
std::lock_guard<std::mutex> guard(shardSync_[idx]);
// split to shard
SparseTensor subGrad
= newSparseGrads->subtensor(pos, grads_[idx]->size());
// drop the gradients
dropper->dropGraph(
tensor, sparseGrad, droping_rate, dropping_momentum);
// send the sharded sparse tensor
pushShardedSparseGradient_[idx]->copyFrom(subGrad);
sparseShard->copyFrom(sparseGrad);
// convert back to dense, store it in grads_[idx]
pushShardedSparseGradient_[idx]->toDense(grads_[idx], -pos);
// sparseShard's indices are equal to the indices of the sparse gradient,
// which will be used for sparse fetching
sparseShard->toDense(grads_[idx]);
// optimize
if(scaleLearningRate_) {
shardOpt_[idx]->update(
params_[idx], grads_[idx], batch_words / avgBatchWords_);
@ -125,7 +86,6 @@ void AsyncGraphGroupDrop::pushGradients(Tensor newGrads,
if(movingAvg_)
updateMovingAverage(
paramsAvg_[idx], params_[idx], scheduler_->numberOfBatches());
},
idx,
pos));
@ -140,51 +100,34 @@ void AsyncGraphGroupDrop::init(Ptr<data::Batch> batch) {
AsyncGraphGroup::init(batch);
// extra inits for gradient dropping
if(drop_first) {
int totalSize = graphs_[0]->params()->vals()->size();
int sparseCap = totalSize * 1.5 * (1.0 - droping_rate);
int shardSize = ceil(totalSize / devices_.size());
for(int i = 0; i < devices_.size(); i++)
paramsLocal_.push_back(std::vector<Tensor>());
for(int i = 0; i < devices_.size(); i++) {
// warm-up counter
fetchStep_.push_back(0);
pushStep_.push_back(0);
fetch_ready.push_back(false);
// temporary tensor to compute parameter delta before fetching
paramsDelta_.push_back(newTensor(shardSize, graphs_[i]->getBackend()));
// Size of the sparse tensor
int totalSize = graphs_[0]->params()->vals()->size();
int sparseCap = totalSize * 1.2 * (1.0 - droping_rate);
// tensors to store local params history
for(int h_id = 0; h_id < devices_.size(); h_id++) {
Tensor tmp = newTensor(params_[i]->size(), graphs_[i]->getBackend());
tmp->copyFrom(params_[i]);
paramsLocal_[h_id].push_back(tmp);
}
// individual Gradient dropper per-device
pushDropper_.push_back(PrepareGradientDrop(graphs_[i]->getDevice()));
// N-dropper for fetch
// prepare droppers
std::vector<GradientDrop> tmpDropper;
for(auto device : devices_)
tmpDropper.push_back(PrepareGradientDrop(graphs_[i]->getDevice()));
fetchDropper.push_back(tmpDropper);
// sparsetensor to store sparsified gradients per-device
pushSparseGradient_.push_back(SparseTensor(
new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
pushShardedSparseGradient_.push_back(SparseTensor(
new SparseTensorBase(sparseCap, graphs_[i]->getBackend())));
fetchSparseGradient_.push_back(SparseTensor(new SparseTensorBase(
sparseCap / devices_.size(), graphs_[i]->getBackend())));
droppers_.push_back(tmpDropper);
// sparsetensor to store sparsified gradients per-device per-shard
std::vector<SparseTensor> tmp;
for(int j = 0; j < devices_.size(); j++)
tmp.push_back(SparseTensor(new SparseTensorBase(
sparseCap / devices_.size(), graphs_[i]->getBackend())));
fetchShardedSparseGradient_.push_back(tmp);
sparseGrads_.push_back(tmp);
std::vector<SparseTensor> tmp2;
for(int j = 0; j < devices_.size(); j++)
tmp2.push_back(SparseTensor(new SparseTensorBase(
sparseCap / devices_.size(), graphs_[j]->getBackend())));
sparseShards_.push_back(tmp2);
}
drop_first = false;
}

View File

@ -10,6 +10,7 @@ namespace marian {
class AsyncGraphGroupDrop : public AsyncGraphGroup {
std::vector<int> fetchStep_;
std::vector<int> pushStep_;
std::vector<bool> fetch_ready;
bool drop_first = 1;
@ -17,21 +18,9 @@ class AsyncGraphGroupDrop : public AsyncGraphGroup {
float droping_rate;
float dropping_momentum;
std::vector<GradientDrop> pushDropper_;
std::vector<std::vector<GradientDrop>> fetchDropper;
std::vector<std::vector<GradientDrop>> droppers_;
std::vector<SparseTensor> pushSparseGradient_;
std::vector<SparseTensor> pushShardedSparseGradient_;
std::vector<SparseTensor> fetchSparseGradient_;
std::vector<std::vector<SparseTensor>> fetchShardedSparseGradient_;
std::vector<Tensor> paramsDelta_;
std::vector<std::vector<Tensor>> paramsLocal_;
std::vector<Ptr<TensorAllocator>> allocators;
Tensor newTensor(int size, Ptr<Backend> backend);
std::vector<std::vector<SparseTensor>> sparseGrads_, sparseShards_;
protected:
void init(Ptr<data::Batch> batch);

View File

@ -52,19 +52,19 @@ void MultiNodeGraphGroup::init(Ptr<data::Batch> batch) {
}
// setup delayed gradient storage
if (tau_ > 1) {
if(tau_ > 1) {
delay_count = std::vector<size_t>(mpi_comm_world_size_);
totalBatchWords = std::vector<int>(mpi_comm_world_size_);
optDelayMutex_ = std::vector<std::mutex>(mpi_comm_world_size_);
for (int i = 0;i < mpi_comm_world_size_; i++) {
for(int i = 0; i < mpi_comm_world_size_; i++) {
// Shard buffers across GPUs
auto backend = clientGraphs_[i % devices_.size()]->getBackend();
Tensor accGrad = newTensor(nodeSizes_[i], backend);
Tensor accGradBuff = newTensor(nodeSizes_[i], backend);
accGradients.push_back(accGrad);
accGradientBuffer.push_back(accGradBuff);
}
}
}
}
@ -221,7 +221,7 @@ void MultiNodeGraphGroup::calculateShardSizes() {
*/
void MultiNodeGraphGroup::initShardGpuTensors() {
size_t offset = 0;
for (int i = 0; i < mpi_my_rank_; i++) {
for(int i = 0; i < mpi_my_rank_; i++) {
offset += nodeSizes_[i];
}
for(int shard = 0; shard < devices_.size(); shard++) {
@ -242,7 +242,8 @@ void MultiNodeGraphGroup::initShardGpuTensors() {
* updated parameters.
*/
void MultiNodeGraphGroup::launchServerThread() {
#if MPI_FOUND
// @TODO: move CUDA stuff into separate .cu files and remove '&& CUDA_FOUND'
#if MPI_FOUND && CUDA_FOUND
serverShardThread_ = new std::thread([this] {
// keep track of number of nodes still communicating with this shard
int nCommunicatingNodes = mpi_comm_world_size_;
@ -400,7 +401,8 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
Tensor oldParams,
int gpu,
size_t batchWords) {
#if MPI_FOUND
// @TODO: move CUDA stuff into separate .cu files and remove '&& CUDA_FOUND'
#if MPI_FOUND && CUDA_FOUND
size_t offset = 0;
for(int node = 0; node < mpi_comm_world_size_; node++) {
size_t nodeSize = nodeSizes_[node];
@ -410,9 +412,10 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
Tensor gradient;
// Delayed Gradient Update
if (tau_ > 1) {
if(tau_ > 1) {
std::lock_guard<std::mutex> guard(optDelayMutex_[node]);
accGradientBuffer[node]->copyFrom(newGrads->subtensor(offset, nodeSize));
accGradientBuffer[node]->copyFrom(
newGrads->subtensor(offset, nodeSize));
// Accumulate the gradient
using namespace functional;
Element(_1 += _2, accGradients[node], accGradientBuffer[node]);
@ -420,14 +423,14 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
totalBatchWords[node] += batchWords;
delay_count[node]++;
if (delay_count[node] < tau_)
if(delay_count[node] < tau_)
continue;
delay_count[node] = 0;
gradient = accGradients[node];
batchWords = totalBatchWords[node];
} else {
} else {
gradient = newGrads->subtensor(offset, nodeSize);
}
}
// Copy grads from GPU to CPU (for MPI sending)
cudaMemcpy(clientCommBuffersCPU_[gpu].data(),
@ -455,7 +458,7 @@ void MultiNodeGraphGroup::synchronizeWithServerShards(Tensor newGrads,
MPI_TAG_GRAD_PUSH_,
MPI_COMM_WORLD);
// Reset total gradient and batch words
if (tau_ > 1) {
if(tau_ > 1) {
std::lock_guard<std::mutex> guard(optDelayMutex_[node]);
accGradients[node]->set(0);
totalBatchWords[node] = 0;
@ -554,9 +557,9 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
auto costNode = builder->build(graph, batch);
#if MPI_FOUND
if (t == 0) {
if(t == 0) {
MPI_Barrier(MPI_COMM_WORLD);
if (my_id != 0)
if(my_id != 0)
graph->params()->vals()->copyFrom(clientGraphs_[0]->params()->vals());
MPI_Barrier(MPI_COMM_WORLD);
}
@ -628,20 +631,19 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
// Wait until the thread that wants to do validation is finished.
clientThreadPool_->wait_for_one(lock);
if (options_->get<std::string>("cost-type") != "ce-sum")
if(options_->get<std::string>("cost-type") != "ce-sum")
cost /= tau_;
if (tau_ > 1) {
if(tau_ > 1) {
std::vector<size_t> fakeLength = {1, 1};
auto fb = data::CorpusBatch::fakeBatch(fakeLength,
num_seen_sentences,
NULL);
auto fb = data::CorpusBatch::fakeBatch(
fakeLength, num_seen_sentences, NULL);
fb->front()->setWords(num_seen_words);
scheduler_->update(cost, fb);
} else {
scheduler_->update(cost, batch);
}
num_seen_words = 0;
num_seen_sentences = 0;
cost = 0;
@ -653,11 +655,11 @@ void MultiNodeGraphGroup::execute(Ptr<data::Batch> batch) {
// a safe state.
clientThreadPool_->wait_for_others(lock);
#if MPI_FOUND
//wait until other nodes are ready
// wait until other nodes are ready
MPI_Barrier(MPI_COMM_WORLD);
// TODO: Saving is broken
//if(mpi_my_rank_ == 0 && scheduler_->saving())
// if(mpi_my_rank_ == 0 && scheduler_->saving())
// this->save(graph);
if(mpi_my_rank_ == 0 && scheduler_->validating())

View File

@ -2,6 +2,9 @@
#if MPI_FOUND
#include "mpi.h"
#endif
#ifdef CUDA_FOUND
#include "cuda_runtime.h"
#endif
@ -124,7 +127,7 @@ protected:
int mpi_comm_world_size_{1};
/**
* Flag to indicate that an MPI message contains message info
* Flag to indicate that an MPI message contains message info
* before sending the gradient (client -> server).
*/
static const int MPI_TAG_GRAD_PUSH_MSG_{0};
@ -233,7 +236,7 @@ protected:
/**
* LocalOptimizers related variables
*/
bool useLocalOpt_;
// bool useLocalOpt_;
/**
* Allocate new tensor on given GPU and store allocator.
@ -405,10 +408,10 @@ public:
MultiNodeGraphGroup(Ptr<Config> options)
: GraphGroup(options),
tau_{options_->get<size_t>("optimizer-delay")},
useLocalOpt_{options_->get<bool>("multi-node-local-optimizers")},
// useLocalOpt_{options_->get<bool>("multi-node-local-optimizers")},
clientCommOverlap{options_->get<bool>("multi-node-overlap")} {
// Set up devices for this node
setupMPI(); //Setup MPI before creating device vectors
setupMPI(); // Setup MPI before creating device vectors
std::vector<size_t> devices;
for(auto& d : options_->getDevices())
devices.push_back(d.no);
@ -526,8 +529,6 @@ public:
return GraphGroup::collectStats(clientGraphs_[0], clientBuilders_[0]);
}
virtual void finalize() {
finalized_ = true;
}
virtual void finalize() { finalized_ = true; }
};
}

View File

@ -0,0 +1,280 @@
#include "training/graph_group_multinode_sync.h"
#include "functional/functional.h"
#include "tensors/tensor_operators.h"
namespace marian {
void MultiNodeGraphGroupSync::updateMovingAverage(Tensor paramsAvg,
Tensor params,
size_t batches) {
using namespace functional;
float decay
= std::max(mvDecay_, 1.f - (float)(batches + 1) / (float)(batches + 10));
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
}
/**
* Set given scheduler to register training observers on the shard optimizers.
*/
void MultiNodeGraphGroupSync::setScheduler(Ptr<Scheduler> scheduler) {
scheduler_ = scheduler;
// optimizer has to be registered last to see a change of learning rate
scheduler_->registerTrainingObserver(scheduler_);
scheduler_->registerTrainingObserver(syncOptimizer_);
}
/**
* Allocate new tensor on given GPU and store allocator.
*/
Tensor MultiNodeGraphGroupSync::newTensor(int size, Ptr<Backend> backend) {
Tensor t;
Ptr<TensorAllocator> allocator = New<TensorAllocator>(backend);
allocator->reserveExact(size * sizeof(float));
allocator->allocate(t, {1, size});
allocators_.push_back(allocator);
return t;
}
/**
* Setup training environment and launch server thread and (if enabled) client
* communication overlap threads.
* Includes setting up MPI, node and shard sizes, clients, server shards and
* communication overlap stuff.
*/
void MultiNodeGraphGroupSync::init(Ptr<data::Batch> batch) {
// Setup clients and shards
setupClients(batch);
int network_size = clientGraphs_[0]->params()->vals()->size();
LOG(info, "model size = {} float params" , network_size);
if (movingAvg_)
paramsAvg_ = newTensor(network_size, clientGraphs_.back()->getBackend());
// setup sync SGD storage; we keep the summed gradient on node 0
sumGradientBuffer = newTensor(network_size, clientGraphs_[0]->getBackend());
accGradientsSync = newTensor(network_size, clientGraphs_[0]->getBackend());
}
/**
* Initialize the CPU arrays, with pinned memory for faster CudaMemCpy operations.
* Requires the graph to be initialized first so we know its size
*/
void MultiNodeGraphGroupSync::initCPUArrays() {
accGradientsSync_cpu = std::vector<float>(clientGraphs_[0]->params()->vals()->size());
receiveBuffer_cpu = std::vector<float>(clientGraphs_[0]->params()->vals()->size());
}
/**
* Setup MPI world size and rank of this node.
*/
void MultiNodeGraphGroupSync::setupMPI() {
#if MPI_FOUND
MPI_Comm_size(MPI_COMM_WORLD, &mpi_comm_world_size_);
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_my_rank_);
#endif
}
/**
* Setup clients that will compute gradients and communicate them with the
* server shards.
* There is one client per GPU.
*/
void MultiNodeGraphGroupSync::setupClients(Ptr<data::Batch> batch) {
runBatchThroughClientGraphs(batch);
initCPUArrays();
}
/**
* Initialize the graphs (models) of all clients on this node with the given
* batch.
*/
void MultiNodeGraphGroupSync::runBatchThroughClientGraphs(Ptr<data::Batch> batch) {
for(int i = 0; i < devices_.size(); i++) {
THREAD_GUARD(clientBuilders_[i]->build(clientGraphs_[i], batch);
clientGraphs_[i]->forward();
clientGraphs_[i]->getBackend()->synchronize(););
}
}
/**
* Sums the gradient computed by one client into the node-level gradient
* accumulation buffer, taking care of locking.
*/
void MultiNodeGraphGroupSync::sumGRAD(Tensor gradient) {
std::lock_guard<std::mutex> guard(sumGradientMutex_);
sumGradientBuffer->copyFrom(gradient);
using namespace functional; // @TODO: makes more sense to do that on the CPU, I think
Element(_1 += _2, accGradientsSync, sumGradientBuffer);
}
/**
* Performs the MPI all-reduce of the accumulated gradients, runs the optimizer
* update and copies the updated parameters back to the local devices.
* Make sure you only call this from device 0.
*/
void MultiNodeGraphGroupSync::sendReceiveUpdateSync() {
#if MPI_FOUND
int network_size = accGradientsSync_cpu.size();
// Copy the data to the CPU
accGradientsSync->get(accGradientsSync_cpu);
// Wait until all nodes are ready
MPI_Barrier(MPI_COMM_WORLD);
int reduce_result = MPI_Allreduce(accGradientsSync_cpu.data(), //CPU buffers
receiveBuffer_cpu.data(),
network_size,
MPI_FLOAT,
MPI_SUM,
MPI_COMM_WORLD);
// Copy the data back to the GPU and do optimizer update
// Do update with last GPU to distribute the memory
clientGraphs_.back()->params()->grads()->set(receiveBuffer_cpu);
// Perform optimizer step
syncOptimizer_->update(clientGraphs_.back());
if(movingAvg_)
updateMovingAverage(
paramsAvg_, clientGraphs_.back()->params()->vals(),
scheduler_->numberOfBatches());
// Distribute the updated parameters to the rest of the devices
std::vector<std::thread> threads;
for(int idx = 0; idx < devices_.size() - 1; idx++) {
threads.emplace_back(std::thread(
[=](int idx) {
clientGraphs_[idx]->params()->vals()->copyFrom(
clientGraphs_.back()->params()->vals());
},
idx));
}
for(auto&& t : threads) {
t.join();
}
// set the accumulating buffers to zero
accGradientsSync->set(0);
std::fill(accGradientsSync_cpu.begin(), accGradientsSync_cpu.end(), 0);
std::fill(receiveBuffer_cpu.begin(), receiveBuffer_cpu.end(), 0);
#endif
}
/**
* Execute given batch on this node, pushing/pulling the resulting
* gradients/parameters to/from the server shards
* or -- if comm. overlap enabled -- to/from the communication buffers, summing
* gradients locally if the communication thread is busy
*
* @param batch Batch on which to perform forward and backward passes.
*/
void MultiNodeGraphGroupSync::execute(Ptr<data::Batch> fullBatch) {
if(!initialized_) {
init(fullBatch);
initialized_ = true;
}
std::vector<Ptr<data::Batch>> batches = fullBatch->split(devices_.size());
static int t = 0;
static float cost = 0;
static size_t num_seen_words = 0;
static size_t num_seen_sentences = 0;
{
auto task = [this, batches](int my_id) {
auto batch = batches[my_id];
auto graph = clientGraphs_[my_id];
auto builder = clientBuilders_[my_id];
auto costNode = builder->build(graph, batch);
if (t == 0) {
if (my_id != 0)
graph->params()->vals()->copyFrom(clientGraphs_[0]->params()->vals());
}
graph->forward();
{
std::lock_guard<std::mutex> guard(sumCostMutex_);
cost += costNode->scalar();
num_seen_words += batch->words();
num_seen_sentences += batch->size();
}
graph->backward();
graph->getBackend()->synchronize(); //@Alham do you know why we need this here?
sumGRAD(graph->params()->grads());
};
ThreadPool pool(devices_.size(), devices_.size());
for(int idx = 0; idx < devices_.size(); ++idx)
pool.enqueue(task, idx);
}
if (t % tau_ == 0)
sendReceiveUpdateSync();
t++;
// Run scheduler (if enabled)
if(t % tau_ == 0 && scheduler_) {
if (options_->get<std::string>("cost-type") != "ce-sum")
cost /= (tau_ * devices_.size());
if (tau_ > 1) {
std::vector<size_t> fakeLength = {1, 1};
auto fb = data::CorpusBatch::fakeBatch(fakeLength,
num_seen_sentences,
NULL);
fb->front()->setWords(num_seen_words);
scheduler_->update(cost, fb);
} else {
scheduler_->update(cost, fullBatch);
}
num_seen_words = 0;
num_seen_sentences = 0;
cost = 0;
if((scheduler_->saving() || scheduler_->validating())) {
#if MPI_FOUND
// wait until other nodes are ready
MPI_Barrier(MPI_COMM_WORLD);
// TODO: Saving is broken
//if(mpi_my_rank_ == 0 && scheduler_->saving())
// this->save(graph);
if(mpi_my_rank_ == 0 && scheduler_->validating()) {
// temporarily save current params
if(movingAvg_)
accGradientsSync->copyFrom(clientGraphs_[0]->params()->vals());
if(movingAvg_)
for(auto graph : clientGraphs_)
graph->params()->vals()->copyFrom(paramsAvg_);
scheduler_->validate(clientGraphs_);
if(movingAvg_)
for(auto graph : clientGraphs_)
graph->params()->vals()->copyFrom(accGradientsSync);
}
// inform other nodes to continue
MPI_Barrier(MPI_COMM_WORLD);
#endif
}
}
}
}

View File

@ -0,0 +1,305 @@
#pragma once
#if MPI_FOUND
#include "mpi.h"
#endif
#ifdef CUDA_FOUND
#include "cuda_runtime.h"
#endif
#include <condition_variable>
#include <future>
#include <thread>
#include <boost/filesystem.hpp>
#include <boost/thread/locks.hpp>
#include <boost/thread/shared_mutex.hpp>
#include "3rd_party/threadpool.h"
#include "training/graph_group.h"
namespace marian {
/**
* Multi-node graph group for asynchronous training over multiple
* machines each with one or multiple GPUs
*/
class MultiNodeGraphGroupSync : public GraphGroup {
public:
virtual void setScheduler(Ptr<Scheduler> scheduler);
protected:
////////////////////////////////////////////////////////////////////////////
// General variables.
/** Number of clients on nodes in MPI world (cluster). */
std::vector<int> numberClientsOfNodes_; // @TODO: not used for now, but might
// be useful later?
/** Whether graph group has been properly initialized with a first batch. */
bool initialized_{false};
/** Memory allocators for tensors (GPUs). */
std::vector<Ptr<TensorAllocator>> allocators_;
////////////////////////////////////////////////////////////////////////////
// Client variables.
/** Graph builders for clients (which run forward and backward passes). */
std::vector<Ptr<models::ModelBase>> clientBuilders_;
/** Graphs of clients. */
std::vector<Ptr<ExpressionGraph>> clientGraphs_;
/** Devices (GPUs) on this node. */
std::vector<size_t> devices_;
/** Mutex to ensure clients are uniquely assigned to graphs and builders. */
std::mutex mutexClientInit_;
/** Mutex to avoid race conditions in scheduler. */
std::mutex schedulerMutex_;
/**
* Batch number counter used for evenly distributing mini-batches across
* nodes.
*/
size_t batchIter_ = 0;
////////////////////////////////////////////////////////////////////////////
// Communication variables.
/** MPI rank of this node. */
int mpi_my_rank_{0};
/** Number of nodes in MPI world (cluster). */
int mpi_comm_world_size_{1};
/**
* Variables for optimizer delay and synchronous SGD
*/
size_t tau_{1};
std::mutex sumGradientMutex_;
std::mutex updateParamsMutex_;
std::mutex sumCostMutex_;
Tensor accGradientsSync;
Tensor sumGradientBuffer;
Tensor paramsAvg_;
std::vector<float> accGradientsSync_cpu;
std::vector<float> receiveBuffer_cpu;
bool synchronization_happened{false};
Ptr<OptimizerBase> syncOptimizer_;
std::vector<std::mutex> optDelayMutex_;
std::vector<size_t> delay_count;
std::vector<int> totalBatchWords;
std::vector<Tensor> accGradients, accGradientBuffer;
bool movingAvg_{false};
float mvDecay_{1e-4};
/**
* Allocate new tensor on given GPU and store allocator.
*/
Tensor newTensor(int size, Ptr<Backend> backend);
/*
* exponential smoothing
*/
void updateMovingAverage(Tensor paramsAvg, Tensor params, size_t batches);
/**
* Setup training environment and launch server thread and (if enabled) client
* communication overlap threads.
* Includes setting up MPI, node and shard sizes, clients, server shards and
* communication overlap stuff.
*/
virtual void init(Ptr<data::Batch> batch);
/**
* Setup MPI world size and rank of this node.
*/
void setupMPI();
/**
* Setup clients that will compute gradients and communicate them with the
* server shards.
* There is one client per GPU.
*/
void setupClients(Ptr<data::Batch> batch);
/**
* Initialize the graphs (models) of all clients on this node with the given
* batch.
*/
void runBatchThroughClientGraphs(Ptr<data::Batch> batch);
/**
* Initialize the CPU arrays, with pinned memory for faster CudaMemCpy
* operations.
*/
void initCPUArrays();
/**
* Sums the gradients from a node, taking care of locking
* @param gradient - the gradient
*/
void sumGRAD(Tensor gradient);
/**
* Does the MPI Communication, parameter update and copying back parameters.
* @TODO ALHAM. God function too godly?
*/
void sendReceiveUpdateSync();
void execute(Ptr<data::Batch> batch);
/**
* Load the GPU configuration of this node (i.e. which GPUs to use) and the
* number of GPUs on the other nodes.
*/
void loadDeviceConfig(std::vector<size_t> deviceConfig) {
size_t index = 0, node = 0, nClientsSeen = 0;
numberClientsOfNodes_ = std::vector<int>(mpi_comm_world_size_, 0);
while(index < deviceConfig.size()) {
if(numberClientsOfNodes_[node] == 0) {
numberClientsOfNodes_[node] = deviceConfig[index];
nClientsSeen = 0;
} else if(nClientsSeen < numberClientsOfNodes_[node]) {
if(node == mpi_my_rank_) {
devices_.push_back(deviceConfig[index]);
}
nClientsSeen++;
} else {
node++;
index--;
}
index++;
}
}
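loadDeviceConfig consumes a flat vector in which each node contributes one count followed by that many device ids; only the ids belonging to this node's MPI rank end up in devices_. A small worked example with made-up numbers.

// Sketch (assumption): deviceConfig layout for two nodes, the first with
// GPUs {0, 1} and the second with GPUs {0, 1, 2, 3}:
//   deviceConfig = {2, 0, 1,   4, 0, 1, 2, 3}
// After parsing on the node with mpi_my_rank_ == 1:
//   numberClientsOfNodes_ = {2, 4}
//   devices_              = {0, 1, 2, 3}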
public:
/**
* (Constructor) Call super class and initialize client graphs and builders.
*/
MultiNodeGraphGroupSync(Ptr<Config> options)
: GraphGroup(options),
tau_{options_->get<size_t>("optimizer-delay")},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
syncOptimizer_{Optimizer(options_)} {
// Set up devices for this node
setupMPI(); // Setup MPI before creating device vectors
std::vector<size_t> devices;
for(auto& d : options_->getDevices())
devices.push_back(d.no);
loadDeviceConfig(devices);
// Create builders and graphs for clients.
for(size_t i = 0; i < devices_.size(); i++) {
clientGraphs_.push_back(New<ExpressionGraph>());
clientGraphs_[i]->setDevice({devices_[i], DeviceType::gpu});
clientGraphs_[i]->reserveWorkspaceMB(options_->get<size_t>("workspace"));
clientBuilders_.push_back(
models::from_config(options_, models::usage::training));
}
}
/**
* Update any client model with given batch if batch is assigned to this node.
*/
void update(Ptr<data::Batch> batch) {
ABORT_IF(finalized_, "Training has already finished.");
if(batchIter_ % mpi_comm_world_size_
== mpi_my_rank_) { // Only take batch assigned to this node
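// e.g. (hypothetical) with 4 MPI processes, the node with rank 2 executes
// batches 2, 6, 10, ... and skips the rest.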
execute(batch);
}
batchIter_++;
}
/**
* Load models from disk if file exists and setting is not disabled
*/
void load() {
if(!options_->get<bool>("no-reload")) {
std::string name = options_->get<std::string>("model");
if(boost::filesystem::exists(name)) {
if(scheduler_)
scheduler_->load(name);
size_t i = 0;
for(auto graph : clientGraphs_)
clientBuilders_[i++]->load(graph, name);
} else if(options_->has("pretrained-model")) {
std::string init = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
init);
size_t i = 0;
for(auto graph : clientGraphs_)
clientBuilders_[i++]->load(graph, init, false);
}
}
}
/**
* Save model of first client's graph to disk
*/
void save(bool final = false) { save(clientGraphs_[0], final); }
/**
* Save model of given graph to disk.
*/
void save(Ptr<ExpressionGraph> graph, bool final = false) {
int idx = 0;
for(int i = 0; i < clientGraphs_.size(); ++i) {
if(graph == clientGraphs_[i]) {
idx = i;
break;
}
}
if(options_->get<bool>("overwrite")) {
std::string name = options_->get<std::string>("model");
clientBuilders_[idx]->save(clientGraphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
} else {
std::string name = options_->get<std::string>("model");
if(!final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
: "unknown";
std::string nameOverwrite = name;
nameOverwrite.replace(
name.size() - 4, 4, ".iter" + numberOfBatches + ".npz");
clientBuilders_[idx]->save(clientGraphs_[idx], nameOverwrite);
}
clientBuilders_[idx]->save(clientGraphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
}
}
/**
* Collect statistics from first client's graph.
*/
Ptr<data::BatchStats> collectStats() {
return GraphGroup::collectStats(
clientGraphs_[0], clientBuilders_[0], devices_.size());
}
virtual void finalize() {
finalized_ = true;
#if MPI_FOUND
MPI_Finalize();
#endif
}
};
}

View File

@ -4,6 +4,27 @@
namespace marian {
SyncGraphGroup::SyncGraphGroup(Ptr<Config> config)
: GraphGroup(config),
devices_{options_->getDevices()},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
delay_{options_->get<size_t>("optimizer-delay")} {
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));
builders_.push_back(models::from_config(options_, models::usage::training));
}
comm_ = createCommunicator(graphs_, options_->get<bool>("no-nccl", false));
}
void SyncGraphGroup::setScheduler(Ptr<Scheduler> scheduler) {
scheduler_ = scheduler;
// optimizer has to be registered last to see changes of learning rate
@ -22,169 +43,133 @@ void SyncGraphGroup::updateMovingAverage(Tensor paramsAvg,
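// Exponential smoothing: paramsAvg = (1 - decay) * paramsAvg + decay * params,
// i.e. paramsAvg keeps a slowly moving average of the current parameters.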
Element(_1 = ((1.f - decay) * _1) + (decay * _2), paramsAvg, params);
}
void SyncGraphGroup::fetchParams(Tensor oldParams,
const std::vector<Tensor>& params) {
// @TODO read guard on parameters
int pos = 0;
std::vector<std::thread> threads;
for(int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back(std::thread(
[=](int idx, int pos) {
oldParams->subtensor(pos, params[idx]->size())->copyFrom(params[idx]);
},
idx,
pos));
pos += shardSize_;
void SyncGraphGroup::initialize(const std::vector<Ptr<data::Batch>>& batches) {
// Initialize 0th graph with random weights in one forward step
{
THREAD_GUARD(builders_[0]->build(graphs_[0], batches[0]);
graphs_[0]->forward(););
// Copy weights from 0th graph to all other graphs
// to have equal weights across devices
ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
for(size_t i = 1; i < graphs_.size(); ++i) {
auto init = [&](size_t i) {
// initialize i-th graph and weights
builders_[i]->build(graphs_[i], batches[0]);
graphs_[i]->forward();
// overwrite weights of i-th graph with weights from 0th graph
graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
};
pool.enqueue(init, i);
}
}
for(auto&& t : threads) {
t.join();
if(movingAvg_ && paramsAvg_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
shardSize_ = ceil(totalSize / (float)devices_.size());
int pos = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize_, totalSize);
auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);
paramsAlloc->reserveExact(__size__ * sizeof(float));
Tensor paramAvg;
paramsAlloc->allocate(paramAvg, {1, __size__});
paramsAvg_.push_back(paramAvg);
paramAvg->copyFrom(graphs_[0]->params()->vals()->subtensor(pos, __size__));
// move to next shard
pos += __size__;
totalSize -= __size__;
}
}
}
void SyncGraphGroup::execute(Ptr<data::Batch> fullBatch) {
std::vector<Ptr<data::Batch>> delayedBatches =
delay_ > 1 ?
fullBatch->split(delay_) :
std::vector<Ptr<data::Batch>>({ fullBatch });
void SyncGraphGroup::execute(Ptr<data::Batch> batch) {
size_t devs = devices_.size();
auto batches = batch->split(delay_ * devs);
float div = batches.size(); // no. of batches
// do not average gradients if cost type is sum.
if(options_->get<std::string>("cost-type") == "ce-sum")
div = 1;
std::vector<std::vector<Ptr<data::Batch>>> delayedBatches;
for(int i = 0; i < delay_; ++i) {
if(i * devs < batches.size()) {
delayedBatches.emplace_back();
for(int j = 0; j < devs; ++j) {
size_t index = i * devs + j;
if(index < batches.size())
delayedBatches.back().push_back(batches[i * devs + j]);
else
delayedBatches.back().push_back(nullptr);
}
}
}
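// e.g. (hypothetical) delay_ = 2, 3 devices and 5 sub-batches give
// delayedBatches = { {b0, b1, b2}, {b3, b4, nullptr} }; the nullptr slot is
// handled below by a do-nothing forward/backward step.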
std::vector<float> costs(devices_.size(), 0.f);
size_t t = 1;
for(auto batch : delayedBatches) {
std::vector<Ptr<data::Batch>> batches = batch->split(devices_.size());
for(const auto& curBatches : delayedBatches) {
if(first_) {
{
THREAD_GUARD(builders_[0]->build(graphs_[0], batches[0]);
graphs_[0]->forward(););
ThreadPool pool(graphs_.size() - 1, graphs_.size() - 1);
for(size_t i = 1; i < graphs_.size(); ++i) {
auto init = [&](size_t i) {
builders_[i]->build(graphs_[i], batches[0]);
graphs_[i]->forward();
graphs_[i]->params()->vals()->copyFrom(graphs_[0]->params()->vals());
};
pool.enqueue(init, i);
}
}
if(params_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
shardSize_ = ceil(totalSize / (float)devices_.size());
int pos = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize_, totalSize);
auto paramsAlloc = New<TensorAllocator>(graph->getBackend());
paramsAllocs_.push_back(paramsAlloc);
paramsAlloc->reserveExact(3 * __size__ * sizeof(float));
Tensor param, grad, tmp;
paramsAlloc->allocate(param, {1, __size__});
paramsAlloc->allocate(grad, {1, __size__});
paramsAlloc->allocate(tmp, {1, __size__});
params_.push_back(param);
grad->set(0.f);
grads_.push_back(grad);
tmpTensors_.push_back(tmp);
param->copyFrom(graphs_[0]->params()->vals()->subtensor(pos, __size__));
pos += __size__;
totalSize -= __size__;
}
}
if(movingAvg_ && paramsAvg_.size() == 0) {
int totalSize = graphs_[0]->params()->vals()->size();
int i = 0;
for(auto graph : graphs_) {
int __size__ = std::min(shardSize_, totalSize);
totalSize -= __size__;
Tensor paramAvg;
auto allocator = New<TensorAllocator>(graph->getBackend());
allocator->reserveExact(__size__ * sizeof(float));
allocator->allocate(paramAvg, {1, __size__});
paramAvg->copyFrom(params_[i++]);
paramsAllocAvg_.push_back(allocator);
paramsAvg_.push_back(paramAvg);
}
}
initialize(curBatches);
first_ = false;
}
{
auto task = [this, &costs, batches](size_t idx) {
auto graph = graphs_[idx];
auto batch = batches[idx];
// Execute single forward/backward step
auto forwardBackward = [this, &costs, curBatches, t](size_t idx, int pos) {
auto graph = graphs_[idx];
auto batch = curBatches[idx];
if(batch->size() > 0) {
auto costNode = builders_[idx]->build(graph, batch);
graph->forward();
costs[idx] += costNode->scalar();
graph->backward();
}
};
if(batch) {
auto costNode = builders_[idx]->build(graph, batch);
graph->forward();
costs[idx] += costNode->scalar();
ThreadPool pool(devices_.size(), devices_.size());
for(int idx = 0; idx < batches.size(); ++idx)
pool.enqueue(task, idx);
}
{
auto task = [this, batches](size_t idx, int pos, bool update) {
int size = params_[idx]->size();
int i = 0;
float div = devices_.size(); // no. of GPUs
// do not average gradients if cost type is sum.
if(options_->get<std::string>("cost-type") == "ce-sum") {
div = 1;
}
for(auto graph : graphs_) {
if(batches[i]->size() > 0) {
auto subGrad = graph->params()->grads()->subtensor(pos, size);
tmpTensors_[idx]->copyFrom(subGrad);
using namespace functional;
Element(_1 = _1 + (_2 / div), grads_[idx], tmpTensors_[idx]);
}
i++;
}
if(update) {
shardOpt_[idx]->update(params_[idx], grads_[idx]);
grads_[idx]->set(0.f);
if(movingAvg_)
updateMovingAverage(
paramsAvg_[idx], params_[idx], scheduler_->numberOfBatches());
for(auto graph : graphs_) {
auto subParam = graph->params()->vals()->subtensor(pos, size);
subParam->copyFrom(params_[idx]);
}
}
};
ThreadPool pool(devices_.size(), devices_.size());
int pos = 0;
for(int idx = 0; idx < devices_.size(); ++idx) {
pool.enqueue(task, idx, pos, t == delay_);
pos += params_[idx]->size();
// only reset gradients to 0 if t == 1
graph->backward(t == 1);
}
else {
// Handle the case of an empty batch: execute a do-nothing forward/backward
// step for proper initialization and resets.
graph->forward();
graph->backward(t == 1);
}
};
// Update parameter shard with gradient shard
auto update = [this, div](size_t idx, int pos) {
int totalSize = graphs_[0]->params()->vals()->size();
int shardSize = ceil(totalSize / (float)devices_.size());
int size = std::min(totalSize - pos, shardSize);
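// e.g. (hypothetical) totalSize = 10 and 3 devices give shardSize = 4 and
// shards of sizes 4, 4, 2 at positions 0, 4 and 8.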
auto curGrad = graphs_[idx]->params()->grads()->subtensor(pos, size);
auto curParam = graphs_[idx]->params()->vals()->subtensor(pos, size);
if(div != 1) {
using namespace functional;
Element(_1 = _1 / div, curGrad);
}
shardOpt_[idx]->update(curParam, curGrad);
if(movingAvg_)
updateMovingAverage(paramsAvg_[idx], curParam, scheduler_->numberOfBatches());
};
comm_->foreach(forwardBackward);
if(t == delayedBatches.size()) {
comm_->scatterReduce();
comm_->foreach(update);
comm_->allGather();
}
t++;
@ -202,24 +187,108 @@ void SyncGraphGroup::execute(Ptr<data::Batch> fullBatch) {
}
if(scheduler_) {
scheduler_->update(cost, fullBatch);
scheduler_->update(cost, batches);
if(scheduler_->saving()) {
this->save();
}
if(scheduler_->validating()) {
if(movingAvg_)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), paramsAvg_);
if(movingAvg_) {
comm_->swapParams(paramsAvg_);
}
// safe, because all graphs are idle during validation with sync sgd
scheduler_->validate(graphs_);
if(movingAvg_)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), params_);
if(movingAvg_) {
comm_->swapParams(paramsAvg_);
}
}
}
}
void SyncGraphGroup::load() {
if(!options_->get<bool>("no-reload")) {
std::string name = options_->get<std::string>("model");
if(boost::filesystem::exists(name)) {
size_t i = 0;
if(scheduler_)
scheduler_->load(name);
for(auto graph : graphs_)
builders_[i++]->load(graph, name);
// @TODO: probably we want to have the list of DeviceIds as an attribute
std::vector<Ptr<Backend>> backends;
for(auto graph : graphs_)
backends.push_back(graph->getBackend());
shardOpt_[0]->load(name + ".optimizer.npz", shardOpt_, backends);
} else if(options_->has("pretrained-model")) {
std::string init = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
init);
size_t i = 0;
for(auto graph : graphs_)
builders_[i++]->load(graph, init, false);
}
}
}
void SyncGraphGroup::save(bool final) {
if(final && scheduler_) {
if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);
scheduler_->validate(graphs_, true);
if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);
}
save(graphs_[0], final);
}
void SyncGraphGroup::save(Ptr<ExpressionGraph> graph, bool final) {
int idx = 0;
for(int i = 0; i < graphs_.size(); ++i) {
if(graph == graphs_[i]) {
idx = i;
break;
}
}
if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);
std::string name = options_->get<std::string>("model");
if(options_->get<bool>("overwrite")) {
builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
} else {
if(!final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
: "unknown";
std::string nameOverwrite = name;
nameOverwrite.replace(
name.size() - 4, 4, ".iter" + numberOfBatches + ".npz");
builders_[idx]->save(graphs_[idx], nameOverwrite);
}
builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
}
if(movingAvg_ && paramsAvg_.size() > 0)
comm_->swapParams(paramsAvg_);
size_t totalSize = graphs_[idx]->params()->vals()->size();
shardOpt_[idx]->save(name + ".optimizer.npz", shardOpt_, totalSize);
}
}

View File

@ -4,6 +4,7 @@
#include "3rd_party/threadpool.h"
#include "training/graph_group.h"
#include "training/communicator.h"
namespace marian {
@ -12,26 +13,27 @@ public:
virtual void setScheduler(Ptr<Scheduler> scheduler);
private:
Ptr<Communicator> comm_;
std::vector<Ptr<models::ModelBase>> builders_;
std::vector<Ptr<ExpressionGraph>> graphs_;
std::vector<DeviceId> devices_;
std::vector<Tensor> params_;
std::vector<Tensor> grads_;
std::vector<Tensor> tmpTensors_;
std::vector<Ptr<TensorAllocator>> paramsAllocs_;
std::vector<Ptr<OptimizerBase>> shardOpt_;
int shardSize_;
bool first_{true};
std::vector<Tensor> paramsAvg_;
std::vector<Ptr<TensorAllocator>> paramsAllocAvg_;
std::vector<Ptr<TensorAllocator>> paramsAllocs_;
bool movingAvg_{false};
float mvDecay_{1e-4f};
size_t delay_{1};
void initialize(const std::vector<Ptr<data::Batch>>& batches);
void updateMovingAverage(Tensor paramsAvg, Tensor params, size_t batches);
void fetchParams(Tensor oldParams, const std::vector<Tensor>& params);
@ -39,117 +41,23 @@ private:
void execute(Ptr<data::Batch> batch);
public:
SyncGraphGroup(Ptr<Config> config)
: GraphGroup(config),
devices_{options_->getDevices()},
movingAvg_{options_->get<float>("exponential-smoothing") > 0},
mvDecay_{options_->get<float>("exponential-smoothing")},
delay_{options_->get<size_t>("optimizer-delay")} {
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graph->getBackend()->setClip(options_->get<float>("clip-gemm"));
graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));
builders_.push_back(models::from_config(options_, models::usage::training));
}
}
SyncGraphGroup(Ptr<Config> config);
void update(Ptr<data::Batch> batch) {
ABORT_IF(finalized_, "Training has already finished.");
execute(batch);
}
void load() {
if(!options_->get<bool>("no-reload")) {
std::string name = options_->get<std::string>("model");
if(boost::filesystem::exists(name)) {
size_t i = 0;
if(scheduler_)
scheduler_->load(name);
for(auto graph : graphs_)
builders_[i++]->load(graph, name);
// @TODO: probably we want to have the list of DeviceIds as an attribute
std::vector<Ptr<Backend>> backends;
for(auto graph : graphs_)
backends.push_back(graph->getBackend());
shardOpt_[0]->load(name + ".optimizer.npz", shardOpt_, backends);
} else if(options_->has("pretrained-model")) {
std::string init = options_->get<std::string>("pretrained-model");
LOG(info,
"Initialize model weights with the pre-trained model {}",
init);
size_t i = 0;
for(auto graph : graphs_)
builders_[i++]->load(graph, init, false);
}
}
}
void save(bool final = false) {
if(final && scheduler_) {
if(movingAvg_ && paramsAvg_.size() > 0)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), paramsAvg_);
scheduler_->validate(graphs_, true);
if(movingAvg_ && paramsAvg_.size() > 0)
for(auto graph : graphs_)
fetchParams(graph->params()->vals(), params_);
}
save(graphs_[0], final);
}
void save(Ptr<ExpressionGraph> graph, bool final = false) {
int idx = 0;
for(int i = 0; i < graphs_.size(); ++i) {
if(graph == graphs_[i]) {
idx = i;
break;
}
}
if(movingAvg_ && paramsAvg_.size() > 0)
fetchParams(graphs_[idx]->params()->vals(), paramsAvg_);
std::string name = options_->get<std::string>("model");
if(options_->get<bool>("overwrite")) {
builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
} else {
if(!final) {
std::string numberOfBatches
= scheduler_ ? std::to_string(scheduler_->numberOfBatches())
: "unknown";
std::string nameOverwrite = name;
nameOverwrite.replace(
name.size() - 4, 4, ".iter" + numberOfBatches + ".npz");
builders_[idx]->save(graphs_[idx], nameOverwrite);
}
builders_[idx]->save(graphs_[idx], name, true);
if(scheduler_)
scheduler_->save(name);
}
if(movingAvg_ && paramsAvg_.size() > 0)
fetchParams(graphs_[idx]->params()->vals(), params_);
size_t totalSize = graphs_[idx]->params()->vals()->size();
shardOpt_[idx]->save(name + ".optimizer.npz", shardOpt_, totalSize);
}
void load();
void save(bool final = false);
void save(Ptr<ExpressionGraph> graph, bool final = false);
Ptr<data::BatchStats> collectStats() {
return GraphGroup::collectStats(graphs_[0], builders_[0], devices_.size() * delay_);
return GraphGroup::collectStats(graphs_[0], builders_[0], numBatches());
}
size_t numBatches() {
return devices_.size() * delay_;
}
virtual void finalize() {

View File

@ -154,10 +154,20 @@ public:
}
void update(float cost, Ptr<data::Batch> batch) {
update(cost, std::vector<Ptr<data::Batch>>({batch}));
}
void update(float cost, const std::vector<Ptr<data::Batch>>& batches) {
state_->validated = false;
auto batchSize = batch->size(); // number of sentences in batch
auto batchLabels = batch->words(-1); // number of target words in batch
auto batchSize = 0; // number of sentences in batch
auto batchLabels = 0; // number of target words in batch
for(const auto& batch : batches) {
batchSize += batch->size();
batchLabels += batch->words(-1);
}
// Reconstruct the summed cost, for displaying epoch-level averages instead of mini-batch-level ones
auto costType = options_->get<std::string>("cost-type");
auto dispLabelCounts = options_->get<bool>("disp-label-counts"); // if true then show as "cost per label * number of labels"
@ -178,6 +188,7 @@ public:
state_->wordsDisp += batchLabels; // target words processed since last display, for speed display
state_->samplesEpoch += batchSize; // sentences processed in this epoch
state_->labelsTotal += batchLabels; // total labels processed
state_->newBatch();
if(state_->batches % options_->get<size_t>("disp-freq") == 0) {

View File

@ -14,7 +14,7 @@
#include "translator/beam_search.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/printer.h"
#include "translator/output_printer.h"
#include "translator/scorers.h"
namespace marian {
@ -294,9 +294,11 @@ public:
boost::timer::cpu_timer timer;
{
auto printer = New<OutputPrinter>(options_, vocabs_.back());
auto collector = options_->has("valid-translation-output")
? New<OutputCollector>(fileName)
: New<OutputCollector>(*tempFile);
if(quiet_)
collector->setPrintingStrategy(New<QuietPrinting>());
else
@ -329,7 +331,7 @@ public:
for(auto history : histories) {
std::stringstream best1;
std::stringstream bestn;
Printer(options_, vocabs_.back(), history, best1, bestn);
printer->print(history, best1, bestn);
collector->Write(history->GetLineNum(),
best1.str(),
bestn.str(),

View File

@ -36,16 +36,22 @@ public:
const Beams& beams,
std::vector<Ptr<ScorerState>>& states,
size_t beamSize,
bool first) {
bool first,
Ptr<data::CorpusBatch> batch) {
Beams newBeams(beams.size());
for(int i = 0; i < keys.size(); ++i) {
// keys contains indices to vocab items in the entire beam.
// values can be between 0 and beamSize * vocabSize.
std::vector<float> alignments;
if(options_->get<float>("alignment", 0.f))
// Use alignments from the first scorer, even when ensembling
alignments = scorers_[0]->getAlignment();
for(int i = 0; i < keys.size(); ++i) {
// Keys contains indices to vocab items in the entire beam.
// Values can be between 0 and beamSize * vocabSize.
int embIdx = keys[i] % vocabSize;
int beamIdx = i / beamSize;
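// e.g. (hypothetical) vocabSize = 50000 and keys[i] = 100003 give embIdx = 3,
// i.e. vocabulary item 3 proposed by one of this sentence's hypotheses.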
// retrieve short list for final softmax (based on words aligned
// Retrieve short list for final softmax (based on words aligned
// to source sentences). If short list has been set, map the indices
// in the sub-selected vocabulary matrix back to their original positions.
auto shortlist = scorers_[0]->getShortlist();
@ -72,6 +78,8 @@ public:
beamHypIdx = 0;
auto hyp = New<Hypothesis>(beam[beamHypIdx], embIdx, hypIdxTrans, cost);
// Set cost breakdown for n-best lists
if(options_->get<bool>("n-best")) {
std::vector<float> breakDown(states.size(), 0);
beam[beamHypIdx]->GetCostBreakdown().resize(states.size(), 0);
@ -82,12 +90,57 @@ public:
}
hyp->GetCostBreakdown() = breakDown;
}
// Set alignments
if(!alignments.empty()) {
auto align = getHardAlignmentsForHypothesis(
alignments, batch, beamSize, beamHypIdx, beamIdx);
hyp->SetAlignment(align);
}
newBeam.push_back(hyp);
}
}
return newBeams;
}
std::vector<float> getHardAlignmentsForHypothesis(
const std::vector<float> alignments,
Ptr<data::CorpusBatch> batch,
int beamSize,
int beamHypIdx,
int beamIdx) {
// Let B be the beam size, N the number of batched sentences,
// and L the number of words in the longest sentence in the batch.
// The alignment vector:
//
// if(first)
// * has length of N x L if it's the first beam
// * stores elements in the following order:
// beam1 = [word1-batch1, word1-batch2, ..., word2-batch1, ...]
// else
// * has length of N x L x B
// * stores elements in the following order:
// beams = [beam1, beam2, ..., beam_n]
//
// The mask vector is always of length N x L and has 1/0s stored like
// in a single beam, i.e.:
// * [word1-batch1, word1-batch2, ..., word2-batch1, ...]
//
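// Example (hypothetical sizes): with N = 2 sentences, L = 3 source words and
// B = 2 hypotheses, a non-first step stores N x L x B = 12 values:
//   beam1 = [word1-batch1, word1-batch2, word2-batch1, word2-batch2,
//            word3-batch1, word3-batch2]
//   beam2 = [same layout for the second hypothesis]
// so the value for word w of sentence beamIdx in hypothesis beamHypIdx sits at
// batchWidth * beamHypIdx + batchSize * w + beamIdx, as computed below.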
size_t batchSize = batch->size();
size_t batchWidth = batch->width() * batchSize;
std::vector<float> align;
for(size_t w = 0; w < batchWidth / batchSize; ++w) {
size_t a = ((batchWidth * beamHypIdx) + beamIdx) + (batchSize * w);
size_t m = a % batchWidth;
if(batch->front()->mask()[m] != 0)
align.emplace_back(alignments[a]);
}
return align;
}
Beams pruneBeam(const Beams& beams) {
Beams newBeams;
for(auto beam : beams) {
@ -108,7 +161,9 @@ public:
Histories histories;
for(int i = 0; i < dimBatch; ++i) {
size_t sentId = batch->getSentenceIds()[i];
auto history = New<History>(sentId, options_->get<float>("normalize"), options_->get<float>("word-penalty"));
auto history = New<History>(sentId,
options_->get<float>("normalize"),
options_->get<float>("word-penalty"));
histories.push_back(history);
}
@ -183,8 +238,12 @@ public:
// BUGBUG: it's not cost but score (higher=better)
for(int i = 0; i < scorers_.size(); ++i) {
states[i] = scorers_[i]->step(
graph, states[i], hypIndices, embIndices, dimBatch, localBeamSize);
states[i] = scorers_[i]->step(graph,
states[i],
hypIndices,
embIndices,
dimBatch,
localBeamSize);
if(scorers_[i]->getWeight() != 1.f)
totalCosts
@ -219,14 +278,22 @@ public:
nth->getNBestList(beamSizes, totalCosts->val(), outCosts, outKeys, first);
int dimTrgVoc = totalCosts->shape()[-1];
beams = toHyps(
outKeys, outCosts, dimTrgVoc, beams, states, localBeamSize, first);
beams = toHyps(outKeys,
outCosts,
dimTrgVoc,
beams,
states,
localBeamSize,
first,
batch);
auto prunedBeams = pruneBeam(beams);
for(int i = 0; i < dimBatch; ++i) {
if(!beams[i].empty()) {
final = final
|| histories[i]->size() >= options_->get<float>("max-length-factor") * batch->front()->batchWidth();
|| histories[i]->size()
>= options_->get<float>("max-length-factor")
* batch->front()->batchWidth();
histories[i]->Add(beams[i], trgEosId_, prunedBeams[i].empty() || final);
}
}

View File

@ -24,6 +24,9 @@ public:
float GetCost() const { return cost_; }
std::vector<float>& GetCostBreakdown() { return costBreakdown_; }
std::vector<float>& GetAlignment() { return alignment_; }
void SetAlignment(const std::vector<float>& align) { alignment_ = align; };
private:
const Ptr<Hypothesis> prevHyp_;
@ -32,6 +35,7 @@ private:
const float cost_;
std::vector<float> costBreakdown_;
std::vector<float> alignment_;
};
typedef std::vector<Ptr<Hypothesis>> Beam;

View File

@ -0,0 +1,64 @@
#include "output_printer.h"
namespace marian {
std::vector<HardAlignment> OutputPrinter::getAlignment(
const Ptr<Hypothesis>& hyp,
float threshold) {
std::vector<SoftAlignment> alignSoft;
// Skip EOS
auto last = hyp->GetPrevHyp();
// Get soft alignments for each target word
while(last->GetPrevHyp().get() != nullptr) {
alignSoft.push_back(last->GetAlignment());
last = last->GetPrevHyp();
}
std::vector<HardAlignment> align;
// Alignments by maximum value
if(threshold == 1.f) {
for(size_t t = 0; t < alignSoft.size(); ++t) {
// Retrieved alignments are in reversed order
size_t rev = alignSoft.size() - t - 1;
size_t maxArg = 0;
for(size_t s = 0; s < alignSoft[0].size(); ++s) {
if(alignSoft[rev][maxArg] < alignSoft[rev][s]) {
maxArg = s;
}
}
align.push_back(std::make_pair(maxArg, t));
}
} else {
// Alignments by greater-than-threshold
for(size_t t = 0; t < alignSoft.size(); ++t) {
// Retrieved alignments are in reversed order
size_t rev = alignSoft.size() - t - 1;
for(size_t s = 0; s < alignSoft[0].size(); ++s) {
if(alignSoft[rev][s] > threshold) {
align.push_back(std::make_pair(s, t));
}
}
}
}
// Sort alignment pairs in ascending order
std::sort(align.begin(),
align.end(),
[](const HardAlignment& a, const HardAlignment& b) {
return (a.first == b.first) ? a.second < b.second
: a.first < b.first;
});
return align;
}
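// Example (hypothetical values): soft alignments {0.1, 0.7, 0.2} for target
// word 0 and {0.6, 0.3, 0.1} for target word 1 yield the hard pairs 1-0 and
// 0-1, both with threshold 1.f (argmax) and with threshold 0.5; after sorting,
// getAlignmentString() below renders them as " ||| 0-1 1-0".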
std::string OutputPrinter::getAlignmentString(
const std::vector<HardAlignment>& align) {
std::stringstream alignStr;
alignStr << " |||";
for(auto p = align.begin(); p != align.end(); ++p) {
alignStr << " " << p->first << "-" << p->second;
}
return alignStr.str();
}
}

View File

@ -0,0 +1,86 @@
#pragma once
#include <vector>
#include "common/config.h"
#include "common/utils.h"
#include "data/vocab.h"
#include "translator/history.h"
#include "translator/hypothesis.h"
namespace marian {
typedef std::vector<float> SoftAlignment;
typedef std::pair<size_t, size_t> HardAlignment;
class OutputPrinter {
public:
OutputPrinter(Ptr<Config> options, Ptr<Vocab> vocab)
: vocab_(vocab),
reverse_(options->get<bool>("right-left")),
nbest_(options->get<bool>("n-best", false)
? options->get<size_t>("beam-size")
: 0),
alignment_(options->get<float>("alignment", 0.f)) {}
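// e.g. (hypothetical) --n-best with --beam-size 6 keeps 6 n-best entries per
// input line; --alignment 0.4 appends source-target pairs whose attention
// exceeds 0.4, while --alignment 1 falls back to argmax alignments.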
template <class OStream>
void print(Ptr<History> history, OStream& best1, OStream& bestn) {
const auto& nbl = history->NBest(nbest_);
for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& words = std::get<0>(result);
const auto& hypo = std::get<1>(result);
std::string translation = Join((*vocab_)(words), " ", reverse_);
bestn << history->GetLineNum() << " ||| " << translation;
if(alignment_ > 0.f) {
auto align = getAlignment(hypo, alignment_);
bestn << getAlignmentString(align);
}
bestn << " |||";
if(hypo->GetCostBreakdown().empty()) {
bestn << " F0=" << hypo->GetCost();
} else {
for(size_t j = 0; j < hypo->GetCostBreakdown().size(); ++j) {
bestn << " F" << j << "= " << hypo->GetCostBreakdown()[j];
}
}
float realCost = std::get<2>(result);
bestn << " ||| " << realCost;
if(i < nbl.size() - 1)
bestn << std::endl;
else
bestn << std::flush;
}
auto result = history->Top();
const auto& words = std::get<0>(result);
std::string translation = Join((*vocab_)(words), " ", reverse_);
best1 << translation;
if(alignment_ > 0.f) {
const auto& hypo = std::get<1>(result);
auto align = getAlignment(hypo, alignment_);
best1 << getAlignmentString(align);
}
best1 << std::flush;
}
private:
Ptr<Vocab> vocab_;
bool reverse_{false};
size_t nbest_{0};
float alignment_{0.f};
std::vector<HardAlignment> getAlignment(const Ptr<Hypothesis>& hyp,
float threshold);
std::string getAlignmentString(const std::vector<HardAlignment>& align);
};
}

View File

@ -1,35 +0,0 @@
#include "printer.h"
namespace marian {
std::vector<size_t> GetAlignment(const HypothesisPtr& hypothesis) {
std::vector<SoftAlignment> aligns;
HypothesisPtr last = hypothesis->GetPrevHyp();
while(last->GetPrevHyp().get() != nullptr) {
aligns.push_back(*(last->GetAlignment(0)));
last = last->GetPrevHyp();
}
std::vector<size_t> alignment;
for(auto it = aligns.rbegin(); it != aligns.rend(); ++it) {
size_t maxArg = 0;
for(size_t i = 0; i < it->size(); ++i) {
if((*it)[maxArg] < (*it)[i]) {
maxArg = i;
}
}
alignment.push_back(maxArg);
}
return alignment;
}
std::string GetAlignmentString(const std::vector<size_t>& alignment) {
std::stringstream alignString;
alignString << " |||";
for(size_t wordIdx = 0; wordIdx < alignment.size(); ++wordIdx) {
alignString << " " << wordIdx << "-" << alignment[wordIdx];
}
return alignString.str();
}
}

View File

@ -1,56 +0,0 @@
#pragma once
#include <vector>
#include "common/utils.h"
#include "data/vocab.h"
#include "translator/history.h"
namespace marian {
template <class OStream>
void Printer(Ptr<Config> options,
Ptr<Vocab> vocab,
Ptr<History> history,
OStream& best1,
OStream& bestn) {
bool reverse = options->get<bool>("right-left");
if(options->has("n-best") && options->get<bool>("n-best")) {
const auto& nbl = history->NBest(options->get<size_t>("beam-size"));
for(size_t i = 0; i < nbl.size(); ++i) {
const auto& result = nbl[i];
const auto& words = std::get<0>(result);
const auto& hypo = std::get<1>(result);
float realCost = std::get<2>(result);
std::string translation = Join((*vocab)(words), " ", reverse);
bestn << history->GetLineNum() << " ||| " << translation << " |||";
if(hypo->GetCostBreakdown().empty()) {
bestn << " F0=" << hypo->GetCost();
} else {
for(size_t j = 0; j < hypo->GetCostBreakdown().size(); ++j) {
bestn << " F" << j << "= " << hypo->GetCostBreakdown()[j];
}
}
bestn << " ||| " << realCost;
if(i < nbl.size() - 1)
bestn << std::endl;
else
bestn << std::flush;
}
}
auto bestTranslation = history->Top();
std::string translation
= Join((*vocab)(std::get<0>(bestTranslation)), " ", reverse);
best1 << translation << std::flush;
}
}

View File

@ -1,8 +1,9 @@
#pragma once
#include "marian.h"
#include "models/model_factory.h"
#include "data/shortlist.h"
#include "models/model_factory.h"
namespace marian {
@ -41,8 +42,10 @@ public:
virtual void init(Ptr<ExpressionGraph> graph) {}
virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) {};
virtual void setShortlistGenerator(
Ptr<data::ShortlistGenerator> shortlistGenerator){};
virtual Ptr<data::Shortlist> getShortlist() { return nullptr; };
virtual std::vector<float> getAlignment() { return {}; };
};
class ScorerWrapperState : public ScorerState {
@ -104,97 +107,19 @@ public:
graph, wrappedState, hypIndices, embIndices, dimBatch, beamSize));
}
virtual void setShortlistGenerator(Ptr<data::ShortlistGenerator> shortlistGenerator) {
virtual void setShortlistGenerator(
Ptr<data::ShortlistGenerator> shortlistGenerator) {
encdec_->setShortlistGenerator(shortlistGenerator);
};
virtual Ptr<data::Shortlist> getShortlist() {
return encdec_->getShortlist();
};
};
//class WordPenaltyState : public ScorerState {
//private:
// int dimVocab_;
// Expr penalties_;
//
//public:
// WordPenaltyState(int dimVocab, Expr penalties)
// : dimVocab_(dimVocab), penalties_(penalties) {}
//
// virtual Expr getProbs() { return penalties_; };
//
// virtual float breakDown(size_t i) {
// return getProbs()->val()->get(i % dimVocab_);
// }
//};
//
//class WordPenalty : public Scorer {
//private:
// int dimVocab_;
// Expr penalties_;
//
//public:
// WordPenalty(const std::string& name, float weight, int dimVocab)
// : Scorer(name, weight), dimVocab_(dimVocab) {}
//
// virtual void clear(Ptr<ExpressionGraph> graph) {}
//
// virtual Ptr<ScorerState> startState(Ptr<ExpressionGraph> graph,
// Ptr<data::CorpusBatch> batch) {
// std::vector<float> p(dimVocab_, 1);
// p[0] = 0;
// p[2] = 0;
//
// penalties_ = graph->constant({1, dimVocab_}, inits::from_vector(p));
// return New<WordPenaltyState>(dimVocab_, penalties_);
// }
//
// virtual Ptr<ScorerState> step(Ptr<ExpressionGraph> graph,
// Ptr<ScorerState> state,
// const std::vector<size_t>& hypIndices,
// const std::vector<size_t>& embIndices,
// int dimBatch,
// int beamSize) {
// return state;
// }
//};
//
//class UnseenWordPenalty : public Scorer {
//private:
// int batchIndex_;
// int dimVocab_;
// Expr penalties_;
//
//public:
// UnseenWordPenalty(const std::string& name,
// float weight,
// int dimVocab,
// int batchIndex)
// : Scorer(name, weight), dimVocab_(dimVocab), batchIndex_(batchIndex) {}
//
// virtual void clear(Ptr<ExpressionGraph> graph) {}
//
// virtual Ptr<ScorerState> startState(Ptr<ExpressionGraph> graph,
// Ptr<data::CorpusBatch> batch) {
// std::vector<float> p(dimVocab_, -1);
// for(auto i : (*batch)[batchIndex_]->data())
// p[i] = 0;
// p[2] = 0;
//
// penalties_ = graph->constant({1, dimVocab_}, inits::from_vector(p));
// return New<WordPenaltyState>(dimVocab_, penalties_);
// }
//
// virtual Ptr<ScorerState> step(Ptr<ExpressionGraph> graph,
// Ptr<ScorerState> state,
// const std::vector<size_t>& hypIndices,
// const std::vector<size_t>& embIndices,
// int dimBatch,
// int beamSize) {
// return state;
// }
//};
virtual std::vector<float> getAlignment() {
return encdec_->getAlignment();
}
};
Ptr<Scorer> scorerByType(std::string fname,
float weight,

View File

@ -8,7 +8,7 @@
#include "3rd_party/threadpool.h"
#include "translator/history.h"
#include "translator/output_collector.h"
#include "translator/printer.h"
#include "translator/output_printer.h"
#include "models/model_task.h"
#include "translator/scorers.h"
@ -83,6 +83,7 @@ public:
size_t batchId = 0;
auto collector = New<OutputCollector>();
auto printer = New<OutputPrinter>(options_, trgVocab_);
if(options_->get<bool>("quiet-translation"))
collector->setPrintingStrategy(New<QuietPrinting>());
@ -111,7 +112,7 @@ public:
for(auto history : histories) {
std::stringstream best1;
std::stringstream bestn;
Printer(options_, trgVocab_, history, best1, bestn);
printer->print(history, best1, bestn);
collector->Write(history->GetLineNum(),
best1.str(),
bestn.str(),
@ -176,6 +177,7 @@ public:
data::BatchGenerator<data::TextInput> bg(corpus_, options_);
auto collector = New<StringCollector>();
auto printer = New<OutputPrinter>(options_, trgVocab_);
size_t batchId = 0;
// @TODO: unify this and get rid of Config object.
@ -205,7 +207,7 @@ public:
for(auto history : histories) {
std::stringstream best1;
std::stringstream bestn;
Printer(options_, trgVocab_, history, best1, bestn);
printer->print(history, best1, bestn);
collector->add(history->GetLineNum(), best1.str(), bestn.str());
}
};