merge with internal master

Marcin Junczys-Dowmunt 2021-03-02 05:15:41 +00:00
commit 55a7047f8a
39 changed files with 2599 additions and 236 deletions


@@ -20,12 +20,17 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Support for CUDA 11.
- General improvements and fixes for MPI handling, which was essentially non-functional before (syncing, random seeds, deadlocks during saving, validation, etc.)
- Allow compiling with -DUSE_MPI=on together with -DUSE_STATIC_LIBS=on, although MPI is still linked dynamically since it has so many dependencies.
- Fix building server with Boost 1.75
- Missing implementation for cos/tan expression operator
### Changed
- Change compile options a la -DCOMPILE_CUDA_SM35 to -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL,
-DCOMPILE_PASCAL, -DCOMPILE_VOLTA, -DCOMPILE_TURING and -DCOMPILE_AMPERE
- Disable -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL by default.
- Dropped support for legacy graph groups.
- Developer documentation framework based on Sphinx+Doxygen+Breathe+Exhale
- Expression graph documentation (#788)
- Graph operators documentation (#801)
## [1.10.0] - 2021-02-06


@@ -169,7 +169,7 @@ SHORT_NAMES = NO
# description.)
# The default value is: NO.
JAVADOC_AUTOBRIEF = NO
JAVADOC_AUTOBRIEF = YES
# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If


@@ -1 +1 @@
v1.10.0
v1.10.2


@@ -118,7 +118,6 @@ ModelState::SetMarianConfigPath()
// Set the Marian config path.
std::string config_path("/var/azureml-app/");
config_path.append(std::getenv("AZUREML_MODEL_DIR"));
config_path.append("/nlxseq2seq/triton/nlxseq2seq/1/data/model/");
config_path.append(config_filepath_str);
marian_config_path_ = config_path;
@@ -199,6 +198,16 @@ ModelInstanceState::ModelInstanceState(
extern "C" {
void
handler(int sig) {
void* array[30];
size_t size = backtrace(array, 30);
fprintf(stderr, "Error: signal %d, Exception info:\n", sig);
backtrace_symbols_fd(array, size, STDERR_FILENO);
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
@@ -209,6 +218,9 @@ TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state))
);
signal(SIGSEGV, handler);
signal(SIGABRT, handler);
return nullptr; // success
}
@@ -308,7 +320,6 @@ TRITONBACKEND_ModelInstanceExecute(
std::vector<TRITONBACKEND_Input*> request_input;
std::vector<int> request_batch_size;
std::vector<std::string> inputs;
std::string input_strings;
// Create a single response object for each request. If something
@@ -389,14 +400,13 @@ TRITONBACKEND_ModelInstanceExecute(
}
content_buffer.insert(
content_buffer.end(), reinterpret_cast<const char*>(input_buffer) + 4,
reinterpret_cast<const char*>(input_buffer) + buffer_byte_size - 4
reinterpret_cast<const char*>(input_buffer) + buffer_byte_size
);
}
std::string s(content_buffer.begin(), content_buffer.end());
int count = std::count(s.begin(), s.end(), '\n');
request_batch_size.push_back(count + 1);
inputs.push_back(s);
content_buffer.clear();
if (input_strings.empty()) {
@@ -433,12 +443,16 @@ TRITONBACKEND_ModelInstanceExecute(
if (output_content == nullptr) {
output_content = pos;
} else {
strcat(output_content, "\n");
strcat(output_content, pos);
// Replace the null terminator of the previous sentence with a newline character
*(pos - 1) = '\n';
}
// Move to next output content.
if (p != nullptr) {
pos = p + 1;
} else {
// Break if there is no output content left; even though batch_size > 0,
// a trailing '\n' may have been consumed by Marian.
break;
}
batch_size--;
}
@@ -567,4 +581,4 @@ TRITONBACKEND_ModelInstanceExecute(
} // extern "C"
}}} // namespace triton::backend::marian
}}} // namespace triton::backend::marian


@@ -1,4 +1,9 @@
#pragma once
#include <stdio.h>
#include <execinfo.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#ifdef _WIN32
#define DLLEXPORT extern "C" __declspec(dllexport)

doc/.gitignore

@@ -0,0 +1,4 @@
api
build
doxygen
venv

doc/Makefile

@@ -0,0 +1,23 @@
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: clean help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Clean target as recommended by Exhale
# https://exhale.readthedocs.io/en/latest/usage.html#optional-create-a-proper-clean-target
clean:
rm -rf doxygen/ api/
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

doc/README.md

@@ -0,0 +1,51 @@
# Marian NMT code documentation and library API
This directory contains code documentation and library API for developers of Marian NMT.
The documentation is generated using
[Sphinx](https://www.sphinx-doc.org/en/master/usage/quickstart.html) +
[Breathe](https://breathe.readthedocs.io/en/latest/directives.html) +
[Doxygen](http://www.doxygen.nl/manual/docblocks.html) +
[Exhale](https://exhale.readthedocs.io/en/latest/usage.html).
The documentation source code is written in `.rst` or `.md` files with special directives that allow
referencing the C++ source code and its documentation. The source documents are then built into static
HTML pages.
## Installation
On Ubuntu 20.04, install the following packages:
sudo apt-get install python3 python3-pip python3-setuptools doxygen
Then set up a Python environment and install modules:
pip3 install virtualenv
virtualenv venv -p python3
source venv/bin/activate
pip install -r requirements.txt
Documentation building should also work on Windows, but it has not been tested.
## Generation
The documentation can be generated by running:
make html
The website will be generated into `build/html` and is accessible by opening _index.html_ in your
browser.
Directories:
- `build` - automatically generated output directory for the HTML documentation
- `doxygen` - automatically generated Doxygen XML files
- `api` - library API documentation automatically generated with Exhale
- `.rst` and `.md` files in this directory and its subdirectories are documentation source files
- `_static` - custom CSS and JavaScript files
## Writing documentation
To be documented...

doc/_static/css/custom.css

@@ -0,0 +1,4 @@
.wy-body-for-nav > .wy-grid-for-nav > .wy-nav-side {
border-bottom: 5px solid #28bbee;
/*background-color: #494d55;*/
}

doc/conf.py

@@ -0,0 +1,120 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import datetime
import sys
sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'Marian NMT'
copyright = '2021, Marian NMT Team'
author = 'Marian NMT Team'
# The full version, including alpha/beta/rc tags
# TODO: add GitHub commit hash to the version
version_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'VERSION')
with open(os.path.abspath(version_file)) as f:
version = f.read().strip()
release = version + ' ' + str(datetime.date.today())
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.imgmath',
'sphinx.ext.todo',
'breathe',
'exhale',
'recommonmark',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [
'build',
'doxygen',
'venv',
'README.md',
]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
htmlhelp_basename = 'marian'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/custom.css']
# The base URL which points to the root of the HTML documentation
html_baseurl = 'http://marian-nmt.github.io/docs/api'
# -- Extension configuration -------------------------------------------------
breathe_projects = { 'marian': './doxygen/xml' }
breathe_default_project = 'marian'
doxygen_config = """
INPUT = ../src
EXCLUDE += ../src/3rd_party
EXCLUDE += ../src/tests
EXCLUDE_PATTERNS = *.md *.txt
FILE_PATTERNS += *.cu
EXTENSION_MAPPING += cu=C++ inc=C++
ENABLE_PREPROCESSING = YES
JAVADOC_AUTOBRIEF = YES
WARN_IF_UNDOCUMENTED = NO
"""
exhale_args = {
'containmentFolder' : './api',
'rootFileName' : 'library_index.rst',
'rootFileTitle' : 'Library API',
'doxygenStripFromPath' : '..',
'createTreeView' : True,
'exhaleExecutesDoxygen' : True,
'exhaleDoxygenStdin' : doxygen_config.strip(),
}
primary_domain = 'cpp'
highlight_language = 'cpp'
# A trick to include markdown files from outside the source directory using
# 'mdinclude'. Warning: all other markdown files not included via 'mdinclude'
# will be rendered using recommonmark as recommended by Sphinx
from m2r import MdInclude
def setup(app):
# from m2r to make `mdinclude` work
app.add_config_value('no_underscore_emphasis', False, 'env')
app.add_config_value('m2r_parse_relative_links', False, 'env')
app.add_config_value('m2r_anonymous_references', False, 'env')
app.add_config_value('m2r_disable_inline_math', False, 'env')
app.add_directive('mdinclude', MdInclude)

doc/contributing.rst

@@ -0,0 +1 @@
.. mdinclude:: ../CONTRIBUTING.md

doc/graph.md

@@ -0,0 +1,406 @@
# Expression graphs
The design of the deep learning framework in Marian is based on reverse-mode [auto-differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) (also known as backpropagation) with dynamic computation graphs.
Computation graphs allow a great deal of freedom in network architectures, and they can deal with complicated structures like conditions and loops.
The dynamic declaration, which means a new graph is created for each training instance (for a training example or a batch), is also advantageous.
It allows handling of variably sized inputs, as well as the cases where the graph may change depending on the results of previous steps.
Compared to static declaration, dynamic declaration can be expensive, as a new computation graph must be created and optimised for every training instance.
Marian uses careful memory management to remove overhead in computation graph construction, and supports efficient execution on both CPU and GPU.
The main implementation of the computation graph lives under the [`src/graph`](api/dir_src_graph.html#dir-src-graph) directory.
Building blocks for graphs:
- [graph construction](#graph-construction)
- [node types](#node-types)
- [graph execution](#graph-execution)
## Graph construction
What is a computation graph?
All the numerical computations are expressed as a computation graph.
A computation graph (or graph in short) is a series of operations arranged into a graph of nodes.
To put it simply, a graph is just an arrangement of nodes that represent what you want to do with the data.
**Example 1**
Suppose you want to calculate the expression: `z=x*y+sin(x)`.
The computation graph of this expression is something like Figure 1.
![fig1](images/graph_example1.jpg "Figure 1 An example of computation graph")
*Figure 1 An example of computation graph*
In Marian, the `ExpressionGraph` class is the main implementation of a computation graph.
An `ExpressionGraph` object keeps a record of data (tensors) and all operations in a directed graph consisting of `Node` objects.
A `Node` is the basic unit of a graph. It can be an operation (e.g., dot()), or a tensor.
Each operation in a graph is a `NaryNodeOp` (a child of `Node` class).
Each operation defines its forward and backward steps.
Besides operations, a `Node` can also be a constant tensor (`ConstantNode`) or a parameter tensor (`ParamNode`).
To create a graph, we use the `New<>` shortcut in place of a regular constructor:
```cpp
// create a graph
auto graph = New<ExpressionGraph>();
```
After creating a graph, we also need to initialise the graph object with device options via `setDevice()` and workspace memory via `reserveWorkspaceMB()`; otherwise the program will crash.
```cpp
// initialise graph with device options
// here we specify device no. is 0
// device type can be DeviceType::cpu or DeviceType::gpu
graph->setDevice({0, DeviceType::cpu});
// preallocate workspace memory (MB) for the graph
graph->reserveWorkspaceMB(128);
```
The _workspace memory_ is the amount of memory available for the forward and backward steps of the training procedure.
It does not include the model parameters and optimizer state, which are allocated outside the workspace;
hence you cannot allocate all device memory to the workspace.
To build a graph, Marian offers a set of shortcut functions that implement the common expression operators for a neural network (see [`src/graph/expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html)), such as `affine()`.
These functions construct the corresponding operation nodes in the graph and link them with other nodes;
e.g., `affine()` constructs an `AffineNodeOp` node in the graph.
Thus, building a graph turns into a simple task of defining expressions by using those functions.
**Building graph of Example 1 using Marian**
The following code is used to build the graph in Example 1 with inputs `x=2` and `y=3`.
```cpp
// create and initialise a graph object
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(8);
// add input node x
auto x = graph->constant({1,1}, inits::fromValue(2));
// add input node y
auto y = graph->constant({1,1}, inits::fromValue(3));
// define expression
auto mulOp = x*y;
auto sinOp = sin(x);
auto z = mulOp + sinOp;
// You can also define this expression: auto z = x*y + sin(x);
```
For the above example, `constant()` is used to construct a constant node (a tensor) in the graph as the input.
We will give more details about this function in the next section [**Node types**](#node-types).
The operators `*` and `+` and the function `sin()` add the corresponding operation nodes (i.e., `MultNodeOp` and `SinNodeOp`) to the graph.
To inspect the graph, Marian offers the `graphviz()` function, which generates the graph layout in Graphviz format for visualisation.
This visualisation might not be practical for real-size graphs due to an enormous number of nodes and layers.
You can print the graph layout on the console by running the following code:
```cpp
// print the graph layout on console
std::cout<<graph->graphviz()<<std::endl;
```
**Graph visualisation of Example 1**
The resulting graph is shown in Figure 2. Here we use an online Graphviz editor [edotor](https://edotor.net/) to generate the graph (by pasting the output of `graphviz()`).
![fig2](images/example1_dot.png "Figure 2 Graph layout of Example 1")
*Figure 2 Graph layout of Example 1*
In Figure 2, there are two numbers (between the pair of parentheses) in each node.
The first number indicates the node ID, and the second number specifies whether the node is trainable (0 means no; 1 means yes).
We will cover the concept of *trainable* in the [**ParamNode section**](#paramnode).
One thing to notice here is that Marian adopts dynamic computation graphs;
this means that the nodes are consumed once the forward or backward pass is performed.
Thus, we need to call the `graphviz()` function before performing the computation.
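For instance, a minimal sketch (reusing the `graph` object from Example 1, and assuming `graphviz()` returns a printable string as suggested by the earlier snippet):
```cpp
// capture the layout first: the nodes are consumed by graph execution
std::string dot = graph->graphviz();
graph->forward();
// dot can still be printed or written to a file here
```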
## Node types
As mentioned earlier, `Node` is the basic unit of a graph.
Each `Node` defines its forward steps in `Node::forward()` and backward steps in `Node::backward()`.
To access the resulting tensor of the forward pass, we can call `Node::val()`,
while `Node::grad()` returns the accumulated gradient (a tensor) from the backward pass.
There are three main classes of Node in Marian: `ConstantNode`, `ParamNode` and `NaryNodeOp`.
### ConstantNode
The `ConstantNode` class is used to construct a constant node in the graph.
A constant node is a constant tensor whose value is immutable during training.
A `ConstantNode` instance is usually used to construct the input layer.
To construct a constant node in the graph, we can use `constant()` function in the `ExpressionGraph` class.
We need to specify the shape and element type for the constant node.
For the shape, we can initialise a `Shape` instance with vector-style initialisation;
e.g., `Shape shape={2,3};` denotes a 2D matrix with `dim[0]=2` and `dim[1]=3`.
The element type must be one of the values of the `Type` enumeration,
which stores all data types supported in Marian, e.g., `Type::float16`.
If the type is not specified, the default type of the graph will be used.
The default type of the graph is usually `Type::float32` unless you change it with `setDefaultElementType()`.
```cpp
// construct a constant node in the graph with default type
auto x = graph->constant({N, NUM_FEATURES}, inits::fromVector(inputData));
```
For the above example, the shape of the constant node is `{N, NUM_FEATURES}`, and the value of the constant node is initialised from a vector `inputData`.
`inits::fromVector()` returns a `NodeInitializer` which is a functor used to initialise a tensor by copying from the given vector.
More functions used to initialise a node can be found in [`src/graph/node_initializers.h`](api/namespace_marian__inits.html#namespace-marian-inits) file.
Marian also provides some shortcut functions to construct special constant nodes, such as `ones()` and `zeros()`:
```cpp
// construct a constant node filled with ones
auto ones = graph->ones({10,10});
// construct a constant node filled with zeros
auto zeros = graph->zeros({10,10});
```
### ParamNode
`ParamNode` is used to store model parameters whose value can be changed during the training, such as weights and biases.
In addition to the shape and the element type, we need to specify whether a `ParamNode` object is _trainable_ or not.
If a parameter node is _trainable_, then its value will be tracked and updated during the training procedure.
For a `ParamNode`, the default value of `trainable_` is `true`.
We can control whether a parameter node is trainable with the `Node::setTrainable()` function.
To construct a parameter node in the graph, we use the `param()` function of the `ExpressionGraph` class.
For a parameter node, we also need to specify its name.
```cpp
// construct a parameter node called W1 in the graph
auto W1 = graph->param("W1", {NUM_FEATURES, 5}, inits::uniform(-0.1f, 0.1f));
```
The parameter node `W1` has a shape of `{NUM_FEATURES, 5}`, and is initialised with random numbers from the uniform distribution `Uniform(-0.1, 0.1)`.
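As a one-line sketch of the `setTrainable()` mechanism mentioned above (using the `W1` node just created; treat this as illustrative rather than a recommended pattern):
```cpp
// exclude W1 from training: its gradient is no longer tracked or applied
W1->setTrainable(false);
```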
### NaryNodeOp
`NaryNodeOp` is the base class that defines the operations in a graph.
It mainly contains unary and binary operators.
Each `NaryNodeOp` defines its forward operations in `Node::forwardOps()` and backward operations in `Node::backwardOps()`.
In the current version of Marian, we provide a set of common operations (inherited from `NaryNodeOp`) used to build a neural network,
such as `AffineNodeOp` (affine transformation), `CrossEntropyNodeOp` (cross-entropy loss function) and `TanhNodeOp` (tanh activation function).
As mentioned earlier, Marian implements a set of APIs that can easily add operations to the graph.
E.g., we can use `affine()` to perform affine transformation and then `tanh()` to perform tanh activation function on the results:
```cpp
// perform affine transformation: x*W1+b
// and then perform tanh activation function
auto h = tanh(affine(x, W1, b1));
```
In the above example, `affine()` and `tanh()` actually add `AffineNodeOp` and `TanhNodeOp` nodes to the graph.
More shortcut functions for adding operations to the graph can be found in the [`src/graph/expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html) file.
## Graph execution
Once you have finished building the graph by adding all the nodes, you can perform the real computation.
### Forward pass
The forward pass refers to the calculation process.
It traverses through all nodes from the input layer (leaves) to the output layer (root).
To perform the forward pass, you can call the function `forward()`. The `forward()` function mainly does two things:
- allocates memory for each node (`Node::allocate()`)
- computes the new tensor for each node by performing the required operations (`Node::forward()`); the resulting tensor is stored in the `val_` attribute of each node.
**Forward pass of Example 1**
To run the forward pass of Example 1, you can run the following code:
```cpp
// Perform the forward pass on the nodes of the graph
graph->forward();
// get the computation result of z
std::vector<float> w;
z->val()->get(w);
std::cout<<"z="<<w[0]<<std::endl;
// The output is: z=6.9093
```
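For reference, this agrees with evaluating the expression directly (with `sin` in radians): `z = x*y + sin(x) = 2*3 + sin(2) ≈ 6 + 0.9093 = 6.9093`.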
### Backward pass
The backward pass refers to the process of propagating the output error backwards through the graph and computing gradients.
It traverses through all *trainable* nodes from the output layer to the input layer.
You can call `backward()` to perform the backward pass.
The `backward()` function mainly computes the gradients using the chain rule:
- allocates memory and initialises gradients for each *trainable* node
- computes the gradients based on the backward steps (`Node::backwardOps()`) of each node, and stores them in the `adj_` attribute of each node
- using the chain rule, propagates all the way to the input layer
We also provide a shortcut function `backprop()` which performs first the forward pass and then the backward pass on the nodes of the graph:
```cpp
// Perform backpropagation on the graph
graph->backprop();
// This function is equivalent to the following code:
/*
graph->forward();
graph->backward();
*/
```
**Backward pass of modified Example 1**
As shown in Figure 2, there is no trainable node in the graph of Example 1;
this means we cannot perform a backward pass on this graph.
To demonstrate the backward pass, we modify Example 1 by changing the constant node `x` to a parameter node (change `constant()` to `param()`).
Here is the modification:
```cpp
// add parameter node x
auto x = graph->param("x", {1,1}, inits::fromValue(2));
```
The resulting graph is also different, as displayed in Figure 3.
![fig3](images/example1_dot2.png "Figure 3 Graph layout of modified Example 1")
*Figure 3 Graph layout of modified Example 1*
To perform the backward pass of modified Example 1, you can run the following code:
```cpp
// Perform the backward pass on the trainable nodes of the graph
graph->backward();
// get the gradient of x node
std::vector<float> b;
x->grad()->get(b);
std::cout<<"dz/dx="<<b[0]<<std::endl;
// The output is: dz/dx=2.58385
```
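This again agrees with the analytic derivative: `dz/dx = y + cos(x) = 3 + cos(2) ≈ 3 - 0.41615 = 2.58385`.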
### Optimiser
After the backward pass, we obtain the gradients of the leaves.
However, the job is not done yet.
To train a model, we need to update the model parameters according to the gradients.
This brings us to how we define the loss function and the optimiser for the graph.
A loss function is used to calculate the model error between the predicted value and the actual value.
The goal is to minimise this error during training.
In a graph, the loss function is also represented as a group of nodes.
You can also use the operators provided in [`expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html) file to define the loss function.
E.g., Marian offers `cross_entropy()` function to compute the cross-entropy loss between true labels and predicted labels.
**Define a loss function for modified Example 1**
Suppose we know the actual value of `z` is 6 when `y = 3`, and `x` is the parameter we would like to learn.
The loss function we choose here is the absolute error:
```cpp
// pass the actual value to the model
auto actual = graph->constant({1,1}, inits::fromValue(6));
// define loss function
auto loss = abs(actual-z);
```
The graph is changed to Figure 4.
![fig4](images/example1_dot3.png "Figure 4 Graph layout of modified Example 1 with loss function")
*Figure 4 Graph layout of modified Example 1 with loss function*
The purpose of the optimiser is to adjust the variables to fit the data.
In Marian, there are three built-in optimiser classes: `Sgd`, `Adagrad` and `Adam`.
`Sgd` is an optimiser based on [stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent).
For each iteration, it updates the parameter `w` according to the rule of `w = w - learning_rate * gradient`.
`Adagrad` implements [Adagrad algorithm](https://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf),
an optimiser with parameter-specific learning rates, which are adapted relative to how frequently a parameter gets updated during training.
`Adam` is an implementation of the [Adam algorithm](https://arxiv.org/abs/1412.6980),
a stochastic gradient descent method that is based on an adaptive estimation of first-order and second-order moments.
We use `Optimizer<>` to set up an optimiser with the learning rate:
```cpp
// Choose optimizer (Sgd, Adagrad, Adam) and initial learning rate
auto opt = Optimizer<Adam>(0.01);
```
After an iteration of backpropagation, we can call the `update()` function to update the parameters:
```cpp
// update parameters in the graph
opt->update(graph);
```
**Set up an optimiser for modified Example 1**
Continuing with the modified Example 1, we choose `Sgd` as the optimiser and update the parameter `x`:
```cpp
// set up Sgd optimiser with 0.005 learning rate
auto opt = Optimizer<Sgd>(0.005);
// update parameters
opt->update(graph);
// get the new value of x
std::vector<float> v;
x->val()->get(v);
std::cout<<"x="<<v[0]<<std::endl;
// The output is: x=1.98708
```
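This matches the SGD update rule stated earlier: `x = x - 0.005 * 2.58385 ≈ 1.98708`.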
### Debugging
For debugging, we can call `debug()` to print node parameters. The `debug()` function has to be called prior to graph execution.
Once a node is marked for debugging, its value (resulting tensor) and the gradient will be printed out during the forward and backward pass.
It is also recommended to turn on the Marian logger by calling `createLoggers()` for more information.
**Debugging for modified Example 1**
Suppose we want to check the results of node `x` during the computation. We can call `debug()` to mark node `x` for debugging.
```cpp
// mark node x for debugging with logging message "Parameter x"
debug(x, "Parameter x");
```
The output is shown as follows with `createLoggers()`:
```
[2021-02-16 15:10:51] [memory] Reserving 256 B, device gpu0
[2021-02-16 15:10:51] Debug: Parameter x op=param
[2021-02-16 15:10:51] shape=1x1 size=1 type=float32 device=gpu0 ptr=140505547538432 bytes=256
min: 2.00000000 max: 2.00000000 l2-norm: 2.00000000
[[ 2.00000000 ]]
[2021-02-16 15:10:51] [memory] Reserving 256 B, device gpu0
[2021-02-16 15:10:51] Debug Grad: Parameter x op=param
[2021-02-16 15:10:51] shape=1x1 size=1 type=float32 device=gpu0 ptr=140505547538944 bytes=256
min: 2.58385324 max: 2.58385324 l2-norm: 2.58385324
[[ 2.58385324 ]]
```
### More advanced
Looking more closely at graph execution: a graph keeps track of all the `Node` objects in its `nodesForward_` and `nodesBackward_` lists.
`nodesForward_` contains all nodes used for the forward pass and `nodesBackward_` contains all trainable nodes used for the backward pass.
All the tensor objects for a graph are stored in its `tensors_` attribute.
`tensors_` is a shared pointer holding memory and nodes for a graph.
Since each `Node` can result in new tensors, this attribute is used to allocate memory for new tensors during the forward and backward pass.
This `tensors_` attribute gets cleared before a new graph is built.
Another important attribute in `ExpressionGraph` is `paramsByElementType_`.
This attribute holds memory and nodes that correspond to graph parameters.
You can call `params()` function in a graph to get all the parameter objects:
```cpp
// return the Parameters object related to the graph
// The Parameters object holds the whole set of the parameter nodes.
graph->params();
```
In addition, Marian provides APIs to support gradient checkpointing.
This method trades compute for memory: during the backward pass, the forward-pass segment between checkpoints is rerun instead of keeping all intermediate tensors in memory.
Currently, Marian only supports setting checkpoint nodes manually, by calling `Node::markCheckpoint()` or `checkpoint()`.
To enable the gradient-checkpointing mode for a graph, we use `setCheckpointing()`:
```cpp
// enable gradient-checkpointing for a graph
graph->setCheckpointing(true);
```
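Marking a checkpoint might then look like the following sketch (reusing `x`, `W1` and `b1` from the earlier examples; where to place checkpoints is left to the user):
```cpp
// mark the output of this sub-expression as a checkpoint; the segment
// between checkpoints is recomputed during the backward pass
auto h = checkpoint(tanh(affine(x, W1, b1)));
```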
We can also save and load the parameters of a graph in Marian.
We can call `save()` to save all parameters in the graph into a file (`.npz` or `.bin` format).
The function `load()` can load all model parameters to the graph (either from an array of `io::Items`, a file or a buffer).
```cpp
// specify the filename
std::string filename = "my_model.npz";
// save all the parameters into a file
graph->save(filename);
// load model from a file
graph->load(filename);
```

doc/images/example1_dot.png (binary image)


doc/images/example2.png (binary image)


doc/index.rst

@@ -0,0 +1,47 @@
Welcome to Marian's documentation!
==================================
|buildgpu| |buildcpu| |tests| |release| |license|
Marian is an efficient and self-contained Neural Machine Translation framework with an integrated
automatic differentiation engine based on dynamic computation graphs, written entirely in C++.
This is developer documentation. User documentation is available at https://marian-nmt.github.io/docs/
.. toctree::
:maxdepth: 2
:caption: Contents:
graph
operators
api/library_index
contributing
Indices and tables
------------------
* :ref:`genindex`
.. |buildgpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.1.svg?label=CUDAC%20Build
:target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev/
:alt: GPU build status
.. |buildcpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU%20Build
:target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/
:alt: CPU build status
.. |tests| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=Tests
:target: http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/
:alt: Tests status
.. |release| image:: https://img.shields.io/github/release/marian-nmt/marian.svg?label=Release
:target: https://github.com/marian-nmt/marian/releases
:alt: Latest release
.. |license| image:: https://img.shields.io/badge/License-MIT-blue.svg
:target: ../LICENSE.md
:alt: License: MIT

doc/make.bat

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

doc/operators.md

@@ -0,0 +1,553 @@
# Operations in the Expression Graph
Operations are responsible for manipulating the elements of an expression graph.
In Marian, many useful operations have already been implemented and can be found in
the code documentation. The provided operations cover simple arithmetic, logical
comparisons and common mathematical functions, as well as tensor manipulation,
for example `slice` or `reshape`, and aggregations such as `sum` or `minimum`.
Finally, other routines useful in building neural networks, such as activation
functions, are also available.
There are several components required to implement an operation in
Marian's expression graph. The highest-level component is the Expression
Operator, responsible for setting up the Node Operator and adding it to the
graph. Next, this Node Operator describes the nature of the forward and backward
operation to be performed. These operations are implemented using some
combination of Functional Operators (element wise), and Tensor Operators.
This overview aims to explain what each of the different operator components
does, how they fit together and where to go to make changes, so that, equipped
with this knowledge, you are able to add new functionality to Marian.
## Operator Structure
The central component in the graph is the `Chainable<Tensor>` object. This
object provides the abstract interface necessary to interact with elements in
the computation graph. The details of this interface can be found in
[/src/graph/chainable.h](api/file_src_graph_chainable.h.html). Note that the
template parameter corresponds to the underlying data structure, which in Marian
is the `Tensor`. Therefore, for convenience, the type `Expr` is defined:
```cpp
typedef IPtr<Chainable<Tensor>> Expr;
```
The implementation of the different operator components are divided across
several files:
- Expression Operator
- [/src/graph/expression_operators.h](api/file_src_graph_expression_operators.h.html)
- [/src/graph/expression_operators.cpp](api/file_src_graph_expression_operators.cpp.html)
- Node Operator
- [/src/graph/node_operators_unary.h](api/file_src_graph_node_operators_unary.h.html)
- [/src/graph/node_operators_binary.h](api/file_src_graph_node_operators_binary.h.html)
- [/src/graph/node_operators_tuple.h](api/file_src_graph_node_operators_tuple.h.html)
- Functional Operator
- [/src/functional/operators.h](api/file_src_functional_operators.h.html)
- Tensor operation
- [/src/tensors/tensor_operators.h](api/file_src_tensors_tensor_operators.h.html)
- [/src/tensors/cpu/tensor_operators.cpp](api/file_src_tensors_cpu_tensor_operators.cpp.html)
- [/src/tensors/gpu/tensor_operators.cu](api/file_src_tensors_gpu_tensor_operators.cu.html)
- Declared Specialization
- [/src/tensors/gpu/element.inc](api/program_listing_file_src_tensors_gpu_element.inc.html)
- [/src/tensors/gpu/add.inc](api/program_listing_file_src_tensors_gpu_add.inc.html)
- [/src/tensors/gpu/add_all.inc](api/program_listing_file_src_tensors_gpu_add_all.inc.html)
To understand how the different components are inter-linked, we'll look at each
of them in turn.
## Expression Operator
The expression operator is the user-facing method used when building a graph. It
is responsible for constructing the corresponding Node Operation and inserting
it into the expression graph. To accommodate these core requirements, the
function `Expression` is able to perform both actions in generality:
```cpp
template <class T, typename... Args>
Expr Expression(Args&&... args) {
auto e = Expr(new T(std::forward<Args>(args)...));
return e->graph()->add(e);
}
```
This helper-function simplifies the definition of many expression operators. For
example, the implementation of the expression operator `sin(x)` is simply:
```cpp
// src/graph/expression_operators.h
Expr sin(Expr x);
// src/graph/expression_operators.cpp
Expr sin(Expr x) {
return Expression<SinNodeOp>(x);
}
```
However, implementations may perform actions beyond the core functionality
alone. Taking `sum` as an example:
```cpp
Expr sum(Expr a, int ax) {
if(a->shape()[ax] == 1) {
return a;
}
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum);
}
```
The trivial operation is handled without needing to construct a node operation.
This example also demonstrates a non-trivial construction of `ReduceNodeOp`,
which is capable of performing differing reduction operations depending on
instantiation.
Going further, an expression operator may be defined in terms of existing
expressions. Operators such as `weighted_average` are composed of three
different expression operator calls: `scalar_product`, `sum`, and `operator/`.
```cpp
Expr weighted_average(Expr in, Expr weights, int ax) {
auto p = scalar_product(in, weights, ax);
auto s = sum(weights, ax);
return p / s;
}
```
While useful, composition at this level may be less efficient than lower-level
implementations.
## Node Operator
The `Node` subclass of `Chainable<Tensor>` provides concrete implementations for
much of the abstract interface, while subclasses of `Node` enable different node
behaviours. In the context of operations, the relevant derived class is
`NaryNodeOp`, which is the base class used for Node Operators. This subclass provides
implementation focused on performing general N-arity operations. However, many
common operations are unary and, for convenience, a further specialization,
`UnaryNodeOp`, exists to simplify their definition.
The purpose of the Node Operator is to define the forward and backward behaviour
of the operation. The forward operation performs the desired operation while the
backward operation updates the gradients. These behaviours are written in terms
of `NodeOps`, where a `NodeOp` is a macro wrapping a capturing lambda
function. Explicitly, these are defined as:
```cpp
// src/graph/chainable.h
#define NodeOp(op) [=]() { op; }
typedef std::vector<std::function<void()>> NodeOps;
```
Each `NodeOp` is written as a function in terms of the value (`val_`), gradient
(`adj_`) of the current node, and its children, via `child()`. The value and
gradient of the n<sup>th</sup> child node are accessed via the interfaces
`child(n)->val()` and `child(n)->grad()`, respectively. NodeOps are executed in
order when running the graph forwards and backwards, as this snippet from `Node`
demonstrates:
```cpp
// Node in src/graph/node.h
virtual void runForward(const NodeOps& ops) {
for(auto&& op : ops)
op();
}
virtual void runBackward(const NodeOps& ops) {
size_t i = 0;
for(auto&& op : ops)
if(child(i++)->trainable())
op();
}
```
In the backward operation it is **crucial** that the `NodeOp` responsible for
propagating a gradient to `child(i)` is the i<sup>th</sup> element of the
NodeOps vector. Since a NodeOp is only run when the child at its position is
trainable, an out-of-position NodeOp may not be run. To represent no
operation, a `nullptr` can be passed as a NodeOp, as in the sketch below.
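As an illustration, here is a hedged sketch of `backwardOps()` for a hypothetical binary node whose second child is non-differentiable and non-trainable (e.g., a constant tensor of integer indices), so its slot holds a `nullptr`:
```cpp
NodeOps backwardOps() override {
  using namespace functional;
  // position 0 propagates the incoming gradient adj_ to child(0);
  // position 1 is nullptr because child(1) receives no gradient
  return {NodeOp(Add(_1, child(0)->grad(), adj_)),
          nullptr};
}
```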
A typical node operator has the functionality demonstrated in the following
snippet.
```cpp
// outline of a node op
struct MyNodeOp : public NaryNodeOp {
MyNodeOp(Expr a)
: NaryNodeOp({a}, newShape(...), newType(...)) {}
Shape newShape(...) {} // optional
Type newType(...) {} // optional
const std::string type() override { return "my_node_op"; }
virtual size_t hash() override {} // potentially required
virtual bool equal(Expr node) override {} // potentially required
NodeOps forwardOps() override {}
NodeOps backwardOps() override {}
};
```
This outline describes a node operator that takes a single argument `a`. The
shape and type of the node would be determined by the result of `newShape` and
`newType` when constructing the `NaryNodeOp`. These functions represent any
custom logic used to determine the shape and type of the node. As indicated in
this example code, these are optional and, when omitted, calling
`NaryNodeOp({a})` would result in a node with the same shape and type as `a`.
The `type()` method returns the friendly name for the node. Note that the
[ONNX](https://onnx.ai)
[interface](api/program_listing_file_src_onnx_expression_graph_onnx_serialization.cpp.html)
maintains a mapping of these friendly names to their ONNX representation. In the
absence of any member variables the `hash()` and `equal()` methods can be
omitted, and defer to their `NaryNodeOp` definition. However, if such variables
exist then `hash()` should implement a hashed representation and `equal()`
should provide the necessary conditions to consider nodes equivalent. Finally,
the operations of the node are defined in `forwardOps()` and `backwardOps()`.
Continuing with the example of `sin(x)`, the code responsible for implementing
the behaviour is
```cpp
// src/graph/node_operators_unary.h
struct SinNodeOp : public UnaryNodeOp {
SinNodeOp(Expr x) : UnaryNodeOp(x) {}
NodeOps forwardOps() override {
using namespace functional;
return {NodeOp(Element(_1 = sin(_2), val_, child(0)->val()))};
}
NodeOps backwardOps() override {
using namespace functional;
return {NodeOp(Add(_1 * cos(_2), child(0)->grad(), adj_, child(0)->val()))};
}
const std::string type() override { return "sin"; }
};
```
In this code, the constructor trivially initialises the `UnaryNodeOp`, passing
the expression `x` as its input. This propagates up to `NaryNodeOp` and becomes
`child(0)` of the node. The shape and type of the `SinNodeOp` are equivalent to
those of `x`. The lack of any member variables allows the `hash()` and `equal()`
methods to be omitted. The friendly name for this node is the string `sin`. The
forward and backward implementations are accomplished using a single NodeOp each.
### Forward operation
The forward NodeOp calls the tensor operator `Element`, which executes the
element-wise operation described by the functor:
```cpp
_1 = sin(_2)
```
The placeholders `_1`, `_2` are enabled by code in
[/src/functional](api/dir_src_functional.html) and interoperate with the
functional operators. In the call to `Element`, `val_` is assigned to `_1` and
`child(0)->val()` to `_2`. Therefore, this has the action of setting the
elements of this node to the result obtained by applying `sin` to the elements
of `child(0)`.
### Backward Operation
The backward NodeOp is responsible for backpropagation of the gradients via
reverse-mode automatic differentiation. In this example, where `y = sin(x)`,
this corresponds to evaluating
```
dJ/dx += dJ/dy * dy/dx, dy/dx = cos(x)
```
This is realised using the tensor operator `Add` with the functor
```cpp
_1 * cos(_2)
```
In the call to `Add`, `adj_` is assigned to `_1` and `child(0)->val()` to `_2`.
Therefore, this functor represents `dJ/dy * dy/dx`: the product of the gradient
at the current node and the gradient of the operation. This value is then added
to the gradient of the child `child(0)->grad()` as required.
### Shape and Type Changes
The `newShape` and `newType` methods are just a suggestion of how custom logic
may be encapsulated where needed. However, in practice, many operations do not
require a change in shape or type. In these instances, the node inherits the
broadcasted shape of its children as well as their common type. An important
feature of the type deduction in `NaryNodeOp::commonType()` is that it
guarantees that all child nodes are of the same type.
There are few operations in Marian that require a type specification. Where they
do exist, they are often simple, as the desired type is explicitly provided or
trivially deduced. An example of this is `CastNodeOp`:
```cpp
// CastNodeOp in src/graph/node_operators_unary.h
CastNodeOp(Expr a, Type type) : UnaryNodeOp(a, type) {}
```
The desired type is set explicitly in construction. A slightly different example
is that of `CSRDotNodeOp`. It has several child nodes which are a mixture of
`DataType` and `IndexType` and therefore do not share a common type. The
solution is to explicitly specify the relevant children to
`NaryNodeOp::commonType({...})`.
Shape modifying operations are more common. A simple example is the class of
operations performed by `ReduceNodeOp` which involve an aggregation process
along one axis of the Tensor. The output shape is determined by
```cpp
// ReduceNodeOp in src/graph/node_operators_unary.h
Shape newShape(Expr a, int axis) {
Shape shape = a->shape();
axis_ = shape.axis(axis);
shape.set(axis_, 1);
return shape;
}
```
The output shape is the same as the input's, but with the processed axis reduced
to a single element; e.g., summing over axis 1 of a {2,3} tensor yields shape {2,1}.
Other use cases include transpose and slicing operations, as well as tensor products.
## Functional Operator
As the NodeOps are evaluated, they encounter the underlying data type of the
`Tensor`. At this stage, type-specific intrinsic functions are required. These
intrinsics are implemented in the templated struct `Ops<ElementType>`, with a
specialization required for each type. The current required types are:
- float
- double
- float32x4 (see `src/3rd_party/sse_mathfun.h`)
- float32x8 (see `src/3rd_party/avx_mathfun.h`)
- half (see `cuda_fp16.h` in the CUDA Math API)
Further details are available in
[/src/common/types.h](api/file_src_common_types.h.html).
Returning to the example of `sin(x)`, the specialization for `float` and
`double` requires
```cpp
// src/functional/operators.h
// in namespace marian::functional
template <typename T>
struct Ops {
static HOST_DEVICE_INLINE T sin(const T&) { ABORT("Unknown type"); }
};
// Specialization for float
template <>
struct Ops<float> {
static HOST_DEVICE_INLINE float sin(const float& x) { return sinf(x); }
};
// Specialization for double
template <>
struct Ops<double> {
static HOST_DEVICE_INLINE double sin(const double& x) { return std::sin(x); }
};
```
The remaining specializations can be seen in
[/src/functional/operators.h](api/file_src_functional_operators.h.html). Note
that the general template must produce a runtime abort.
The final component of the functional operator is to call the macro that enables
interoperability with the framework of
[/src/functional](api/dir_src_functional.html). For a unary operator, this is
the macro `UNARY`.
```cpp
UNARY(Sin, sin, Ops<ElementType>::sin(x));
```
where the template parameter `ElementType` **must** be used. There are equivalent
macros for `BINARY` and `TERNARY` Ops.
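For illustration, a binary operator follows the same pattern; this sketch assumes `pow` specializations exist in `Ops` for each supported type:
```cpp
BINARY(Pow, pow, Ops<ElementType>::pow(x, y));
```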
## Tensor Operator
Tensor operations use less abstracted interfaces to interact with the Tensors,
often working with the Tensor data directly. They also rely on BLAS (Basic
Linear Algebra Subprograms) libraries to accelerate these operations, as well as
libraries containing device-specific optimisations. These libraries include:
- CPU
- CBLAS / OpenBLAS
- FBGEMM
- INTGEMM
- MKL
- GPU
- CUDA (cuBLAS)
An important subtlety is that while the CPU-focused libraries use a row-major
representation, the cuBLAS library (GPU) instead uses a column-major
representation.
Furthermore, the OpenMPI and OpenMP libraries are employed for parallelisation,
while macros provided in
[/src/common/definitions.h](api/file_src_common_definitions.h.html) locally
enable faster floating-point math in supported compilers.
```cpp
MARIAN_FFAST_MATH_BEGIN
// ffmath code
MARIAN_FFAST_MATH_END
```
The usual caveats about enabling `fast_math` apply; they are described in
[/src/common/definitions.h](api/file_src_common_definitions.h.html).
Tensor operators are declared in
[/src/tensors/tensor_operators.h](api/file_src_tensors_tensor_operators.h.html);
these are device-agnostic functions that call the relevant device-specific
implementation. The CPU- and GPU-specific implementations are defined in the `cpu`
namespace in [/src/tensors/cpu/](api/dir_src_tensors_cpu.html) and the `gpu`
namespace in [/src/tensors/gpu/](api/dir_src_tensors_gpu.html). A typical
operator therefore defers to an implementation in the device-specific namespace.
```cpp
void TensorOp(marian::Tensor out, marian::Tensor in) {
#ifdef CUDA_FOUND
if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
gpu::TensorOp(out, in);
else
#endif
cpu::TensorOp(out, in);
}
```
When compiled with GPU support, this function dispatches a call to the
implementation that corresponds to the backend device type configured in the
graph (either GPU or CPU). Without GPU support, only the CPU implementation is
available.
Many operations are covered by three general tensor operators: `Element`,
`Aggregate` and `Prod`. The `Element` operator applies a function element-wise
across an arbitrary number of input tensors and stores the result in the output
tensor. The `Aggregate` operator also applies a function element-wise across its
inputs, but instead aggregates the results in the output via a given aggregation
function. A common aggregation function used is addition, which is the basis of
the `Add` and `Reduce` operators. Finally, `Prod` deals with products of
tensors. This operator performs a general matrix multiplication with the
underlying implementation relying on the libraries mentioned above.
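As a rough sketch of how the first two of these operators are invoked (using the signatures discussed later on this page; the tensors `out`, `in1` and `in2` are assumed to exist with compatible shapes):
```cpp
using namespace marian::functional;
// element-wise product: out[i] = in1[i] * in2[i]
Element(_1 = _2 * _3, out, in1, in2);
// aggregation by addition: out[i] += 1.0f * (in1[i] * in2[i])
Add(_1 * _2, 1.0f, out, in1, in2);
```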
Specialized operators exist to manipulate tensors beyond the cases covered
above, such as transposition and concatenation. These operators may even
be expressed in terms of existing tensor operators.
Furthermore, for complicated multi-operation computations, performance gains and
memory improvements may be realised by implementing a tensor operator for that
specific purpose. An example of this is `softmax`, which could be implemented
using multiple expression operators (`exp`, `sum`), but is instead implemented
directly as a tensor operator. These optimized implementations may be device
specific.
## Declared Specialization
The operations performed in the forward and backward methods of NodeOp require
their GPU templates to be explicitly declared. When a new specialization is
introduced without being explicitly instantiated, it will cause a link error at
compilation:
```
.../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
```
To fix these undefined references, we must explicitly add the specialization to
the `.inc` files of [/src/tensors/gpu/](api/dir_src_tensors_gpu.html). Each
`.inc` file is included at the end of its corresponding `.cu` file, ensuring
that the specialization is compiled.
The undefined references should be added to the `.inc` file that corresponds to
the header file that contains the declarations of the missing functions.
The file [element.inc](api/file_src_tensors_gpu_element.inc.html) contains the
specializations of the function defined in
[element.h](api/file_src_tensors_gpu_element.h.html):
```cpp
// src/tensors/gpu/element.h
template <class Functor, class... Tensors>
void Element(Functor functor, Tensor out, Tensors... tensors);
```
Similarly, [add.inc](api/file_src_tensors_gpu_add.inc.html) contains the
specializations for functions matching either of the two signatures in
[add.h](api/file_src_tensors_gpu_add.h.html):
```cpp
// src/tensors/gpu/add.h
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors);
template <class Functor, class AggFunctor, class... Tensors>
void Aggregate(Functor functor, float initAgg, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors);
```
Finally [add_all.inc](api/file_src_tensors_gpu_add_all.inc.html) contains the
specializations for [add_all.h](api/file_src_tensors_gpu_add_all.h.html), which
are several versions of:
```cpp
// src/tensors/gpu/add_all.h
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1);
```
However, for [add_all.h](api/file_src_tensors_gpu_add_all.h.html), there is an
additional type dependence in the first template parameter, which requires two
entries:
```cpp
marian::gpu::AggregateAll< float, ... >( ... );
marian::gpu::AggregateAll< __half, ... >( ... ); // for COMPILE_FP16
```
where the `__half` specialization is related to half-precision floats and should
be added to the `COMPILE_FP16` preprocessor block.
The simplest method to add the correct specialization is to take the compilation
error output and extract the needed signature. To extract the signature:
1. Replace up to, and including, "undefined reference to `" with "template"
2. Replace the final ' with a semi-colon
To conform with definitions in the codebase, we should replace
`IntrusivePtr<marian::TensorBase>` with its typedef `marian::Tensor`. Note that
as these files are included in the `marian::gpu` namespace and explicitly use
the `marian::functional` namespace, it is also possible to omit both of these
prefixes. Typically, the namespace prefix of the specialized function is removed
as well. Following these rules for the example of `SinNodeOp` results in the
following entries:
**element**
```cpp
template void Element<Assign<Var<1>, UnaryFunctor<elem::Sin, Assignee<2> > >, marian::Tensor >(Assign<Var<1>, UnaryFunctor<elem::Sin, Assignee<2> > >, marian::Tensor, marian::Tensor);
```
**add**
```cpp
template void Add<BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,class marian::Tensor,class marian::Tensor >(BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,float,class marian::Tensor,class marian::Tensor,class marian::Tensor);
```
**add_all**
```cpp
template void AggregateAll<float,float,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> > >(std::shared_ptr<marian::Allocator>,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,float,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> >,float,marian::Tensor,marian::Tensor,marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half,float,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> > >(std::shared_ptr<marian::Allocator>,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,float,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> >,float,marian::Tensor,marian::Tensor,marian::Tensor);
#endif
```

doc/requirements.txt

@@ -0,0 +1,6 @@
sphinx==2.4.4
breathe==4.13.0
exhale
sphinx_rtd_theme
recommonmark
m2r


@@ -127,6 +127,7 @@ IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}
/// enum class DeviceType: defines which device is used for computation
enum class DeviceType : size_t { gpu = 0, cpu = 1 };
struct DeviceId {


@@ -28,6 +28,14 @@ struct Slice // Python-like slice/index descriptor
};
typedef std::vector<Slice> Slices;
/**
* Shape class mainly defines the shape or dimensionality of the node.
* Basically, Shape is a wrapper of a std::vector. Its size is the number of
* dimensions. E.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3.
* When the index is negative, the real index is size() + index.
* It implements most common functions demanded by operations, e.g., resize(),
* slice(), and broadcast().
*/
struct Shape {
private:
std::vector<int> shape_;


@@ -143,7 +143,7 @@ do { \
default: ABORT("Unknown type {}", type); \
} \
} while(0)
/// namespace marian
namespace marian {
// small struct to enable templating based on types use for packing
@@ -290,36 +290,37 @@ constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
/// enum class Type: stores all supported data types in Marian
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
int16 = TypeClass::signed_type + 2u,
int32 = TypeClass::signed_type + 4u,
int64 = TypeClass::signed_type + 8u,
int8 = TypeClass::signed_type + 1u, ///< int8 type
int16 = TypeClass::signed_type + 2u, ///< int16 type
int32 = TypeClass::signed_type + 4u, ///< int32 type
int64 = TypeClass::signed_type + 8u, ///< int64 type
uint8 = TypeClass::unsigned_type + 1u,
uint16 = TypeClass::unsigned_type + 2u,
uint32 = TypeClass::unsigned_type + 4u,
uint64 = TypeClass::unsigned_type + 8u,
uint8 = TypeClass::unsigned_type + 1u, ///< uint8 type
uint16 = TypeClass::unsigned_type + 2u, ///< uint16 type
uint32 = TypeClass::unsigned_type + 4u, ///< uint32 type
uint64 = TypeClass::unsigned_type + 8u, ///< uint64 type
float16 = TypeClass::float_type + 2u,
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
float16 = TypeClass::float_type + 2u, ///< float16 type
float32 = TypeClass::float_type + 4u, ///< float32 type
float64 = TypeClass::float_type + 8u, ///< float64 type
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u, ///< special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, ///< special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, ///< special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm
intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm
intgemm8 = TypeClass::intgemm_type + 1u, ///< Int8 quantized (not packed) matrices for intgemm
intgemm16 = TypeClass::intgemm_type + 2u, ///< Int16 quantized (not packed) matrices for intgemm
intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, ///< Int8 quantized and packed (ssse3) matrices for intgemm
intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, ///< Int8 quantized and packed (avx2) matrices for intgemm
intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, ///< Int8 quantized and packed (avx512) matrices for intgemm
intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, ///< Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm
intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm
intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm
intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm
intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm
intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm
intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, ///< Int16 quantized and packed (sse2) matrices for intgemm
intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, ///< Int16 quantized and packed (avx2) matrices for intgemm
intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, ///< Int16 quantized and packed (avx512) matrices for intgemm
};
static inline size_t operator&(TypeClass typeClass, Type type) {

View File

@ -39,6 +39,12 @@ struct BinaryFunctor {
}
};
/**
* Macro to set up unary-functions from marian::functional::Ops.
* @param name name for the struct
* @param name2 callable typedef
* @param func function wrapped
*/
#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
@ -55,6 +61,12 @@ struct BinaryFunctor {
} \
static inline name<Capture> name2(Capture x) { return name<Capture>(x); }
/**
* Macro to set up binary-functions from marian::functional::Ops.
* @param name name for the struct
* @param name2 callable typedef
* @param func function wrapped
*/
#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
@ -95,6 +107,12 @@ struct TernaryFunctor {
}
};
/**
* Macro to set up ternary-functions from marian::functional::Ops.
* @param name name for the struct
* @param name2 callable typedef
* @param func function wrapped
*/
#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
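To make the macro pattern concrete, here is a hedged sketch of typical invocations; the exact operator list and the `sinf`/`powf` mappings are assumptions, not a quote of the real operator table:

```cpp
// Each invocation generates an elem::<Name> struct plus a callable
// marian::functional::<name2>() that builds the corresponding functor.
UNARY(Sin, sin, sinf(x));      // enables sin(_1) in element-wise functors
BINARY(Pow, pow, powf(x, y));  // enables pow(_1, _2)
```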

View File

@ -30,7 +30,7 @@ Expr ExpressionGraph::add(Expr node) {
} else {
node->setId(count_++);
// record in foward graph
// record in forward graph
nodesForward_.push_back(node);
// record in backward graph if training, and keep track of roots
@ -143,6 +143,11 @@ void ExpressionGraph::forward(std::list<Expr>& forwardTape, bool finalPass) {
if(inferenceOnly_)
v->children().clear();
// If checkpointing is disabled, keep the memory for forward signals for all nodes.
// If checkpointing is enabled:
// (a) In the forward pass before the backward pass, free the memory for the nodes in the subtape to save memory.
// (b) In the forward calls during the backward pass, keep the memory in the current subtape to accelerate
// gradient computation.
if(checkpointing_ && !finalPass) {
auto subtape = v->getSubtape();
if(subtape) {
@ -171,12 +176,14 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
ABORT("Aborting");
}
// allocates memory and initialises gradients for parameters
for(auto kvParams : paramsByElementType_) {
kvParams.second->allocateBackward();
if(reset)
kvParams.second->set_zero_adjoint();
}
// for top nodes: allocates memory and initialise gradients to 1
for(auto&& v : topNodes_)
v->init_dependent();
@ -186,13 +193,16 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
bool firstNaN = true;
while(!nodesBackward_.empty()) {
auto v = nodesBackward_.back();
nodesBackward_.pop_back();
auto v = nodesBackward_.back(); // return the last element
nodesBackward_.pop_back(); // remove the last element
// for non-top nodes: allocates memory and initialises gradients to 0
for(auto&& child : v->children())
if(child->trainable() && child->type() != "param")
child->set_zero_adjoint();
// if using gradient checkpointing,
// recompute the forward pass from checkpoint to the root
if(checkpointing_ && v->getSubtape()) {
forward(*v->getSubtape(), /*finalPass=*/true);
}

View File

@ -16,9 +16,18 @@
namespace marian {
/**
* Create an expression node of any type, and pass all
* arguments to any available constructor.
* E.g., to create a ConstantNode, use `Expression<ConstantNode>(...)`.
*/
template <class T, typename... Args>
Expr Expression(Args&&... args);
/**
* The whole tensor set in the graph.
* Holds all tensor objects (memory and nodes) for a graph.
*/
class Tensors {
private:
Ptr<TensorAllocator> tensors_;
@ -27,8 +36,8 @@ private:
typedef std::unordered_map<size_t, std::vector<WExpr>> WeakMemory;
typedef std::unordered_map<size_t, std::vector<Expr>> Memory;
Ptr<WeakMemory> shortterm_;
Ptr<Memory> longterm_;
Ptr<WeakMemory> shortterm_; // holds all nodes for a graph
Ptr<Memory> longterm_; // holds memoized nodes
public:
Tensors(Ptr<Backend> backend)
@ -112,97 +121,145 @@ public:
typedef std::map<Type, Ptr<Parameters>> ElementTypeParamsMap; // keep it sorted, hence map not unordered map
/**
* Main implementation of a computation graph.
* Keeps a record of data (tensors) and all operations. Each operation in a computation graph is a Node.
* Each Node defines its forward and backward steps.
*/
class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
size_t count_{0};
size_t count_{0}; // counter for nodes in the graph; holds the current node index
std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed.
std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed
protected: // (these are protected, not private, for ONNX exporting)
std::list<Expr> nodesForward_;
std::list<Expr> nodesBackward_;
std::list<Expr> nodesForward_; ///< contains all nodes used for forward()
std::list<Expr> nodesBackward_; ///< contains trainable nodes used for backward()
// Holds memory and expressions that correspond to temporary expressions.
// This gets cleared before a new graph is built.
/**
* A shared pointer to the tensor objects in the graph.
* Holds memory and nodes that correspond to tensors in a graph.
* Since operations will result in new tensors, this attribute is used
* to allocate memory for new tensors during forward() and backward().
* This gets cleared before a new graph is built.
*/
Ptr<Tensors> tensors_;
private:
std::unordered_map<size_t, std::vector<Expr>> memoized_;
Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type
bool inferenceOnly_{false};
bool inferenceOnly_{false}; // a flag indicating whether the graph is used for inference only
bool checkpointing_{false}; // use gradient checkpointing if true
bool reloaded_{false};
bool reloaded_{false}; // a flag indicating whether the graph has been reloaded: true if the graph loaded parameters via the load() function
bool throwNaN_{false};
bool throwNaN_{false}; // a flag indicating whether the graph throws an exception when a NaN value is encountered
protected:
// Delete, copy and move constructors
ExpressionGraph(const ExpressionGraph&) = delete;
ExpressionGraph(ExpressionGraph&&) = delete;
// Holds memory and expressions that correspond to graph parameters
// Now we can have multiple types of parameters in a separate parameters object per value type.
// This is currently only accessible through private functions during loading, will abort during training
// when params() is called (e.g. optimizer) and there is more or other types than the default parameter type.
// Currently the only usecase is inference. Trying to access params() for non-default parameter type is going
// to abort. Inference does not need to access a whole set of parameters.
/**
* A map that holds memory and nodes corresponding to graph parameters.
* The key is a Type and the mapped value is a set of parameter objects with the corresponding type.
* Now we can have multiple types of parameters in a separate parameters object per value type.
* This is currently only accessible through private functions during loading, and will abort during training
* when params() is called (e.g. by the optimizer) and there are more or other types than the default parameter type.
* Currently the only use case is inference. Trying to access params() for a non-default parameter type is going
* to abort. Inference does not need to access the whole set of parameters.
*/
ElementTypeParamsMap paramsByElementType_;
Ptr<Backend> backend_;
std::string namespace_;
Ptr<Backend> backend_; ///< a shared pointer to the backend for the graph
std::string namespace_; ///< a string defining the namespace of the graph; each graph has its own unique namespace
public:
/** @brief Constructs a new expression graph
*
* Constructor should be used as New<ExpressionGraph>()
*/
/** Constructs a new expression graph. Constructor should be used as New<ExpressionGraph>(). */
ExpressionGraph(bool inference = false);
/** Destructor. Clear everything related to the graph except memoized nodes. */
virtual ~ExpressionGraph() {
clear();
for(auto kvParams : paramsByElementType_)
kvParams.second->clear();
}
/**
* Set device options used to run the graph.
* @param deviceId a DeviceId struct storing the device number (size_t)
* and the device type (DeviceType::cpu or DeviceType::gpu)
* @param device a pointer to the device object
*/
virtual void setDevice(DeviceId deviceId = {0, DeviceType::gpu},
Ptr<Device> device = nullptr);
/**
* Get device info for the graph.
* @return a DeviceId struct storing the device number (size_t)
* and the device type (DeviceType::cpu or DeviceType::gpu)
*/
DeviceId getDeviceId() { return backend_->getDeviceId(); }
/**
* Get the backend pointer for the graph.
* @return a Ptr<Backend> pointing to the backend
*/
Ptr<Backend> getBackend() { return backend_; }
/** Set whether the graph is used for inference only */
void setInference(bool inference) { inferenceOnly_ = inference; }
/** Check whether the graph is used for inference only (true) or not */
bool isInference() { return inferenceOnly_; }
/**
* Set whether the graph uses gradient checkpointing.
* <a href="https://github.com/cybertronai/gradient-checkpointing">Gradient Checkpointing</a>
* works by trading compute for memory: during the backward pass it reruns the forward computation for each checkpoint segment instead of keeping all intermediate results.
*/
void setCheckpointing(bool checkpointing) { checkpointing_ = checkpointing; }
/** Check whether the graph uses gradient checkpointing or not */
bool isCheckpointing() { return checkpointing_; }
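A minimal sketch of enabling checkpointing, assuming it is configured before the graph is built:

```cpp
auto graph = New<ExpressionGraph>();
graph->setCheckpointing(true);  // trade recomputation for lower peak memory
// ... build nodes, then run graph->forward() / graph->backward() as usual
```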
/**
* Set namespace (std::string) for the graph.
* Each graph has its own unique namespace, which is used to form the name of a parameter object.
*/
void switchParams(const std::string& newNamespace) {
namespace_ = newNamespace;
}
/**
* Copy all parameter objects from one graph to the current graph.
* @param graph a pointer to a graph object
*/
virtual void copyParams(Ptr<ExpressionGraph> graph) {
for(auto p : *graph->params())
param(p->name(), p->shape(), inits::fromTensor(p->val()), p->value_type());
forward(); // this will allocate parameters, execute the intializers and therefore copy parameter values
forward(); // this will allocate parameters, execute the initializers and therefore copy parameter values
}
/**
* Preallocate workspace memory (MB) for the graph.
* Sets the size of the memory available for the forward and backward steps of the training procedure.
* This does not include the model size and optimizer parameters, which are allocated outside the workspace.
*/
void reserveWorkspaceMB(size_t num) {
size_t bytes = num * 1024 * 1024 - 1;
tensors_->reserve(bytes);
}
/** Copy tensor objects from one graph to the current graph */
void reuseWorkspace(Ptr<ExpressionGraph> graph) {
tensors_ = graph->tensors_;
}
/**
* @brief Performs backpropogation on this expression graph.
*
* Backpropogation is implemented by performing first the forward pass and
* Performs backpropagation on this expression graph.
* Backpropagation is implemented by performing first the forward pass and
* then the backward pass of algorithmic differentiation (AD) on the nodes of
* the graph.
*/
@ -211,6 +268,12 @@ public:
backward();
}
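A minimal end-to-end sketch of the workflow documented here; the expression operators dot(), flatten() and sum() are assumed from Marian's operator library:

```cpp
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(128);

auto x = graph->constant({2, 3}, inits::fromValue(1.f));
auto W = graph->param("W", {3, 4}, inits::glorotUniform());
auto loss = sum(flatten(dot(x, W)));  // reduce to a scalar top node

graph->forward();   // compute values for all nodes
graph->backward();  // accumulate gradients, e.g. into W->grad()
```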
/**
* Perform one complete backpropagation pass on the graph to test
* whether the computation fits into the given workspace memory.
* This function is used when searching for the maximal batch size
* that fits into the given workspace memory.
*/
bool fits() {
try {
tensors_->throwAtReallocation(true);
@ -223,19 +286,50 @@ public:
return true;
}
/**
* Check whether the memory allocated for a tensor object contains a NaN or infinite value.
* @param t a Tensor object
* @param isNaN a bool reference receiving whether the tensor contains a NaN value
* @param isInf a bool reference receiving whether the tensor contains an infinite value
*/
void checkNaN(Tensor t, bool& isNaN, bool& isInf);
/**
* Perform the forward pass on the nodes of the graph.
* The forward pass computes the values of all nodes,
* traversing them from the input layer to the output layer.
*/
void forward() {
for(auto kvParams : paramsByElementType_)
kvParams.second->allocateForward();
forwardNext();
}
/**
* Perform the forward pass without memory allocation for parameters.
* Helper function for forward().
*/
void forwardNext();
/**
* Perform the forward pass on the given nodes, with a finalPass flag.
* Helper function for forward() and backward().
* @param forwardTape a reference to the list of nodes used for the forward pass
* @param finalPass a bool that controls whether node memory may be freed when gradient checkpointing is enabled
*/
void forward(std::list<Expr>& forwardTape, bool finalPass);
/**
* Perform the backward pass on the trainable nodes of the graph.
* The backward pass computes the gradients of the output error with respect
* to the trainable nodes, traversing them from the output layer to the input layer.
*/
void backward(bool reset = true, float clipValue = 0.f);
/**
* Generate graph layout in Graphviz format for visualisation.
* @return a string representing the graph layout in Graphviz format (dot)
*/
std::string graphviz() {
std::stringstream ss;
ss << "digraph ExpressionGraph {" << std::endl;
@ -253,6 +347,10 @@ public:
return ss.str();
}
/**
* Write graph layout in Graphviz format to a file.
* @param filename a string specifying the file to which the graph layout is written
*/
void graphviz(const std::string& filename) {
std::ofstream dot(filename);
dot << graphviz();
@ -345,6 +443,18 @@ private:
}
public:
/**
* Construct a parameter node in the graph.
* @param pname a string holding the name of the parameter node
* @param shape a Shape struct defining the shape of the parameter tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
* @param fixed a bool specifying whether the parameter object is fixed (not trainable) or not.
* The default value is false, which means the parameter is trainable.
* @return a pointer to the parameter node
*/
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@ -354,6 +464,17 @@ public:
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
/**
* Construct a parameter node in the graph without a specified type;
* the type is set to defaultElementType_.
* @param pname a string holding the name of the parameter node
* @param shape a Shape struct defining the shape of the parameter tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @param fixed a bool specifying whether the parameter object is fixed (not trainable) or not.
* The default value is false, which means the parameter is trainable.
* @return a pointer to the parameter node
*/
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@ -362,28 +483,59 @@ public:
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
/**
* Construct a constant node in the graph with a specified element type.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
* @return a pointer to the constant node
*/
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
Type elementType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, elementType);
}
/**
* Construct a constant node in the graph without a specified type;
* the type is set to defaultElementType_.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @return a pointer to the constant node
*/
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init, defaultElementType_);
}
// @TODO: add version with iterators
// shortcut to turn vector of indices to integer tensor, to be used with operators
// like rows or select
/**
* Turn a vector of indices into an integer tensor.
* A shortcut to convert a vector of indices into an integer tensor, to be used with operators
* like rows() or index_select().
* @param indicesVector a vector of IndexType (uint32_t) specifying the indices
*/
Expr indices(const std::vector<IndexType>& indicesVector) {
return constant({(int)indicesVector.size()},
inits::fromVector(indicesVector),
Type::uint32);
}
// this version sets up the shape such that the indices are in a given axis
// Use this if you want to pass these indices to gather().
// indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1)
/**
* Specify the indices of elements to be taken from a tensor.
* This version sets up the shape such that the indices are in a given axis.
* Use this if you want to pass these indices to gather().
* E.g., indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1):
* - The size of the resulting shape is the same as that of the indexee; here it is 4.
* - The dimension of the specified axis equals the size of the given indicesVector.
* - The remaining axes are set to 1.
* @param indicesVector a vector of IndexType (uint32_t) specifying the indices
* @param indexee the source tensor that we want to select elements from
* @param axis the axis along which to collect the elements
*/
Expr indices(const std::vector<IndexType>& indicesVector, Expr indexee, int axis = -1) {
Shape shape;
shape.resize(indexee->shape().size());
@ -393,24 +545,70 @@ public:
Type::uint32);
}
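A short sketch of the gather() pairing described above, assuming `indexee` is a (3, 5) tensor already in the graph:

```cpp
// Select columns 0 and 2 along axis 1; idx gets shape (1, 2) per the rule above.
auto idx      = graph->indices({0, 2}, indexee, /*axis=*/1);
auto selected = gather(indexee, /*axis=*/1, idx);  // result has shape (3, 2)
```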
/**
* Construct a constant node filled with `1`.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr ones(const Shape& shape, Type elementType) {
return constant(shape, inits::ones(), elementType);
}
/**
* Construct a constant node filled with `1` without a specified type;
* the type is set to defaultElementType_.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
*/
Expr ones(const Shape& shape) {
return constant(shape, inits::ones(), defaultElementType_);
}
/**
* Construct a constant node filled with `0`.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr zeros(const Shape& shape, Type elementType) {
return constant(shape, inits::zeros(), elementType);
}
/**
* Construct a constant node filled with `0` without a specified type;
* the type is set to defaultElementType_.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
*/
Expr zeros(const Shape& shape) {
return constant(shape, inits::zeros(), defaultElementType_);
}
// prob = dropProb, e.g. 0.1 means 90% of values are kept
/**
* Construct a dropout mask (a tensor of 0 and 1).
* @param dropProb a float specifying the dropout probability.
* E.g., dropProb=0.1 means 90% of values are kept.
* @param shape a Shape struct defining the shape of the mask tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr dropoutMask(float dropProb, const Shape& shape, Type elementType);
/**
* Construct a dropout mask (a tensor of 0 and 1) without a specified type,
* and the type is set to defaultElementType_.
* @param dropProb a float specifying the dropout probability.
* E.g., dropProb=0.1 means 90% of values are kept.
* @param shape a Shape struct defining the shape of the mask tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
*/
Expr dropoutMask(float dropProb, const Shape& shape);
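A minimal sketch of applying such a mask; element-wise multiplication via `operator*` on expressions is assumed:

```cpp
auto mask  = graph->dropoutMask(0.1f, y->shape());  // ~90% ones, ~10% zeros
auto yDrop = y * mask;                              // element-wise masking
```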
/**
* Get the parameter object by name.
* @param name a string specifying the name of the parameter object
*/
Expr get(std::string name) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@ -419,6 +617,11 @@ public:
return p;
}
/**
* Get the parameter object by name and type.
* @param name a string specifying the name of the parameter object
* @param specifiedElementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr get(std::string name, Type specifiedElementType) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@ -427,6 +630,10 @@ public:
return p;
}
/**
* Return the Parameters object related to the graph.
* The Parameters object holds the whole set of the parameter nodes.
*/
Ptr<Parameters>& params() {
// There are no parameter objects, that's weird.
ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created");
@ -441,6 +648,10 @@ public:
return it->second;
}
/**
* Set the default element type for the graph.
* The default is used when a node's element type is not specified.
*/
void setDefaultElementType(Type defaultElementType) {
ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_,
"Parameter objects already exist, cannot change default type from {} to {}",
@ -448,33 +659,58 @@ public:
defaultElementType_ = defaultElementType;
}
/**
* Get the default element type for the graph.
*/
Type getDefaultElementType() { return defaultElementType_; }
/**
* Add an expression node to the graph.
* @param node a pointer to an expression node
*/
Expr add(Expr node);
/**
* Allocate memory for the forward pass of the given node.
* @param node a pointer to an expression node
*/
void allocateForward(Expr node) {
if(tensors_)
tensors_->allocateForward(node);
}
/**
* Allocate memory for the backward pass of the given node.
* @param node a pointer to an expression node
*/
void allocateBackward(Expr node) {
if(tensors_)
tensors_->allocateBackward(node);
}
/**
* Free the memory for a tensor object.
* @param tensor a reference to the tensor object
*/
void free(const Tensor& tensor) {
if(tensors_)
tensors_->free(tensor);
}
// Returns the memory allocator of the graph workspace, allocates row unstructured memory (but 256-byte aligned)
/**
* Returns the memory allocator of the graph workspace.
* Allocates raw unstructured memory (but 256-byte aligned).
*/
Ptr<Allocator> allocator() { return tensors_->getAllocator(); } // @TODO: rename this to getAllocator();
// Returns the tensor allocator of the graph workspace, different from above as proper tensor objects are allocated
/**
* Returns the tensor allocator of the graph workspace.
* Different from allocator() as proper tensor objects are allocated.
*/
Ptr<TensorAllocator> getTensorAllocator() { return tensors_->getTensorAllocator(); }
/** Clear everything apart from parameters and memoized nodes */
void clear() {
// clear everything apart from parameters and memoized nodes
count_ = 0;
nodesForward_.clear();
nodesBackward_.clear();
@ -484,13 +720,17 @@ public:
tensors_->clear();
}
/** Set the flag indicating whether the graph has been reloaded (true) or not */
void setReloaded(bool reloaded) { reloaded_ = reloaded; }
/** Set the flag indicating whether the graph throws an exception on NaN values (true) or not */
void setThrowNaN(bool throwNaN) { throwNaN_ = throwNaN; }
/** Get the flag indicating whether the graph throws an exception on NaN values (true) or not */
bool getThrowNaN() { return throwNaN_; }
public:
// loading from array of io::Items
/** Load model (mainly parameter objects) from an array of io::Items */
void load(std::vector<io::Item>& ioItems, bool markReloaded = true) {
setReloaded(false);
for(auto& item : ioItems) {
@ -509,18 +749,24 @@ public:
setReloaded(true);
}
/** Load model by filename */
void load(const std::string& name, bool markReloaded = true) {
LOG(info, "Loading model from {}", name);
auto items = io::loadItems(name);
load(items, markReloaded);
}
/** Load model from a buffer (a pointer to a model in memory) */
void load(const void* ptr, bool markReloaded = true) {
LOG(info, "Loading model from buffer at {}", ptr);
auto items = io::loadItems(ptr);
load(items, markReloaded);
}
/**
* Turn the model (given a pointer to it in memory) into a memory-mapped type
* by converting all the parameter objects to a memory-mapped version, i.e., MappedParameters.
*/
void mmap(const void* ptr, bool markReloaded = true) {
ABORT_IF(backend_->getDeviceId().type != DeviceType::cpu || !inferenceOnly_,
"Memory mapping only supported for CPU inference mode");
@ -543,7 +789,6 @@ public:
}
}
// pre-populate parameters by type
for(auto& item : items) {
auto it1 = paramsByElementType_.find(item.type);
@ -558,9 +803,19 @@ public:
}
public:
// convert all parameters into an array of io::Item elements, for saving
/**
* Convert all parameters into an array of io::Item elements, for saving.
* @param ioItems an array of io::Item elements
* @param saveElementType the element type for saving
*/
void save(std::vector<io::Item>& ioItems, Type saveElementType = Type::float32);
/**
* Save all parameters into a file (.npz or .bin).
* @param name a string specifying the filename
* @param meta a string specifying the name for the io::Item elements; if not specified, the parameter names are preserved.
* @param saveElementType the element type for saving
*/
void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
save(ioItems, saveElementType);

View File

@ -72,6 +72,14 @@ Expr sin(Expr a) {
return Expression<SinNodeOp>(a);
};
Expr cos(Expr a) {
return Expression<CosNodeOp>(a);
};
Expr tan(Expr a) {
return Expression<TanNodeOp>(a);
};
Expr swish(Expr a) {
return Expression<SwishNodeOp>(a);
}
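A short usage sketch: the newly added trigonometric operators compose like any other unary expression operator:

```cpp
auto y = cos(x);           // element-wise cosine
auto z = tan(x) + sin(x);  // combines freely with other expressions
```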

File diff suppressed because it is too large

View File

@ -27,11 +27,6 @@ void Node::free() {
}
}
/**
* Initialization for backward step of top node
* in computation graph. Allocates memory and sets gradient
* to 1 (df/df == 1).
*/
void Node::init_dependent() {
if(!adj_) {
graph()->allocateBackward(this);
@ -39,12 +34,6 @@ void Node::init_dependent() {
}
}
/**
* Initialization for backward step of any non-top node
* in computation graph. Allocates memory and sets gradient
* to 0 for further accumulation of gradients from all
* parents.
*/
void Node::set_zero_adjoint() {
if(!adj_) {
graph()->allocateBackward(this);

View File

@ -28,13 +28,13 @@ protected:
std::vector<Expr> children_;
Weak<ExpressionGraph> graph_;
Shape shape_{1, 1, 1, 1};
Type valueType_{Type::float32};
Shape shape_{1, 1, 1, 1}; // defines the dimensionality of the node (for tensors)
Type valueType_{Type::float32}; // defines the element type of the node (for tensors)
std::string name_{"none"};
Tensor val_{nullptr};
Tensor adj_{nullptr};
Tensor val_{nullptr}; // the resulting new tensor in forward pass
Tensor adj_{nullptr}; // the accumulated gradients (a tensor) in backward pass
bool markedForDebug_{false};
std::string debugMessage_;
@ -105,9 +105,19 @@ public:
virtual void free() override;
virtual void init() override {};
/**
* Initialization for backward step of top node
* in computation graph. Allocates memory and sets gradient
* to 1 (df/df == 1).
*/
virtual void init_dependent() override;
/**
* Initialization for backward step of any non-top node
* in computation graph. Allocates memory and sets gradient
* to 0 for further accumulation of gradients from all
* parents.
*/
virtual void set_zero_adjoint() override;
virtual Tensor& val() override { return val_; };

View File

@ -98,9 +98,10 @@ Ptr<NodeInitializer> glorotUniform(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(6.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
scale = sqrtf(3.0f / t->shape()[-2]); // results in columns of matrix to be ~unit length
scale = sqrtf(3.0f / t->shape()[-2]); // fanIn mode: the scale adapts to the number of input units
// results in columns of the matrix having ~unit range
if(!fanIn && fanOut)
scale = sqrtf(3.0f / t->shape()[-1]);
scale = sqrtf(3.0f / t->shape()[-1]); // fanOut mode: the scale adapts to the number of output units
scale *= scalingFactor;
@ -112,9 +113,9 @@ Ptr<NodeInitializer> glorotNormal(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(2.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
scale = sqrtf(1.0f / t->shape()[-2]);
scale = sqrtf(1.0f / t->shape()[-2]); // fanIn mode: the scale adapts to the number of input units
if(!fanIn && fanOut)
scale = sqrtf(1.0f / t->shape()[-1]);
scale = sqrtf(1.0f / t->shape()[-1]); // fanOut mode: the scale adapts to the number of output units
scale *= scalingFactor;
@ -170,7 +171,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
bool normalize /*= false*/) {
return fromLambda([file, dimVoc, dimEmb, normalize](Tensor t) {
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
if(normalize) {
if(normalize) { // scaling to unit length:
float norm = 0;
for(auto e : embs)
norm += e * e;

View File

@ -11,17 +11,18 @@
namespace marian {
class ExpressionGraph; // Forward declaration
/**
* The namespace inits.
* Declares the NodeInitializer class and all available functions to initialize a node.
*/
namespace inits {
/**
* Base class for specialized NodeInitializers.
*
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* functionality or use a fromLambda intializer.
*
* and constants, and is invoked on a tensor during node initialization.
* You need to override NodeInitializer::apply(Tensor) with your own
* functionality or use a fromLambda initializer.
* See node_initializers.cpp for examples.
*/
class NodeInitializer {
@ -35,155 +36,242 @@ public:
};
/**
* Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
* Use a lambda function of form [](Tensor t) { do something with t } to initialize tensor.
* @param func functor
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func);
/**
* Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
* Create temporary tensor of Type intermediateType first, initialize and then copy/convert to actual Tensor
* Useful for functions that can only operate on a specific type of tensor
* Use a lambda function of form [](Tensor t) { do something with t } to initialize tensor.
* Create temporary tensor of Type intermediateType first, initialize and then copy/convert to actual Tensor.
* Useful for functions that can only operate on a specific type of tensor.
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func, Type intermediateType);
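A small sketch of a custom initializer; the `t->size()` and `t->set(...)` conveniences are assumed from TensorBase:

```cpp
// Fill a tensor with its linear index values.
auto init = inits::fromLambda([](Tensor t) {
  std::vector<float> v(t->size());
  for(size_t i = 0; i < v.size(); ++i)
    v[i] = (float)i;
  t->set(v);
});
auto c = graph->constant({2, 3}, init);
```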
/**
* Initialize tensor with given value
*
* Creates a NodeInitializer that will intialize the given tensor
* Initialize tensor with given value.
* Creates a NodeInitializer that will initialize the given tensor
* with `value`. Works with any underlying numeric tensor type.
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromValue(float value);
/**
* Fill tensor with `0`
*
* Creates a NodeInitializer that will intialize the given tensor
* Fill tensor with `0`.
* Creates a NodeInitializer that will initialize the given tensor
* with `0`. Works with any underlying numeric tensor type.
*
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> zeros() { return fromValue(0.0f); }
/**
* Fill tensor with `1`
*
* Creates a NodeInitializer that will intialize the given tensor
* Fill tensor with `1`.
* Creates a NodeInitializer that will initialize the given tensor
* with `1`. Works with any underlying numeric tensor type.
*
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> ones() { return fromValue(1.0f); }
/**
* Set the diagonal of a two-dimensional square matrix to `value`.
*
* Sets all values of the tensor to 0 and intializes the diagonal with
* Sets all values of the tensor to 0 and initializes the diagonal with
* the given `value`. If no value is specified, `1` is used by default.
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> eye(float value = 1.f);
/**
* Intialize tensor with normally distributed random numbers
*
* Be default this generates floating point numbers from the
* Initialize tensor with normally distributed random numbers.
* By default this generates floating point numbers from the
* normal distribution Normal(0, 1) unless specified differently.
*
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both GPU and CPU computation. The random sequences generated
* are the same on both devices.
*
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> normal(float mean = 0.f, float stddev = 1.f);
/**
* Intialize tensor with uniformly distributed random numbers
*
* Be default this generates floating point numbers from the
* Initialize tensor with uniformly distributed random numbers.
* By default this generates floating point numbers from the
* uniform distribution Uniform(0, 1) unless specified differently.
*
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both GPU and CPU computation. The random sequences generated
* are the same on both devices.
*
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
*
* @param a the lower bound of the interval
* @param b the upper bound of the interval
* @return A NodeInitializer
*/
Ptr<NodeInitializer> uniform(float a = 0.f, float b = 1.f);
// @TODO: add documentation
/**
* Initialize tensor with random numbers from Bernoulli Distribution.
* The Bernoulli distribution is the discrete probability distribution of
* a random variable which takes value `1` with probability p, and
* value `0` with probability (1-p).
* By default, bernoulli(p) generates a tensor of 1s (with probability p)
* and 0s (with probability 1-p). The `scale` and `shift` parameters
* can map {0,1} to {0,1}*`scale`+`shift`.
* E.g., bernoulli(tensor, 0.5f, 2.f, -1.f) where p=0.5f, scale=2.f, shift=-1.f.
* {0,1} is mapped to {0,1}*2+(-1)= {-1,1}. It generates a tensor composed of
* 50% of 1 and 50% of -1.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> bernoulli(float p, float scale = 1.f, float shift = 0.f);
// @TODO: add documentation
/**
* Initialize tensor with random numbers from Glorot uniform distribution.
* The <a href=http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>Glorot uniform</a>,
* also called Xavier uniform, is designed to keep the scale of
* the gradients roughly the same in all layers.
* This function offers three variants (modes).
* The values of the tensor are sampled from Uniform(-x*scale, x*scale):
* - when fanIn=false and fanOut=false (by default):
* x = sqrt(6 / (in + out))
* - when fanIn=true and fanOut=false (fanIn mode):
* x = sqrt(3 / in)
* - when fanIn=false and fanOut=true (fanOut mode):
* x = sqrt(3 / out)
* where `in` is the number of input units in the tensor, `out` is the number of output units.
* `scale` is used to change the range of Uniform distribution.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> glorotUniform(bool fanIn = false, bool fanOut = false, float scale = 1.f);
// @TODO: add documentation
/**
* Initialize tensor with random numbers from Glorot Normal distribution.
* Similar to function glorotUniform(), this function adopts Normal distribution instead of
* uniform distribution.
* This function offers three variants (modes).
* The values of the tensor are sampled from Normal(0, x*scale), i.e., with mean 0 and standard deviation x*scale:
* - when fanIn=false and fanOut=false (by default):
* x = sqrt(2 / (in + out))
* - when fanIn=true and fanOut=false (fanIn mode):
* x = sqrt(1 / in)
* - when fanIn=false and fanOut=true (fanOut mode):
* x = sqrt(1 / out)
* where `in` is the number of input units in the tensor, `out` is the number of output units.
* `scale` is used to change the spread of the Normal distribution.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> glorotNormal(bool fanIn = false, bool fanOut = false, float scale = 1.f);
// @TODO: add documentation
Ptr<NodeInitializer> dropout(float dropoutProbabilty);
/**
* Initialize a dropout mask (a tensor of 0 and 1) with given dropout probability.
* <a href=https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf>Dropout</a>
* is proposed as a technique to prevent Neural Networks from overfitting.
* @param dropoutProbability a float defining the dropout probability.
* E.g., dropoutProbability=0.1 means 90% of values are kept.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> dropout(float dropoutProbability);
/**
* Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
*
* Initialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps).
* @param eps a small constant that protects against log(0)
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
// @TODO: add documentation
/**
* Initialize tensor by *copying* from the given vector.
* Creates a NodeInitializer that will initialize the tensor
* by *copying* the values from the given vector
* @param v vector
* @return A NodeInitializer
*/
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
/**
* Initialize tensor by *moving* from the given vector.
* Creates a NodeInitializer that will initialize the tensor by *moving* the values
* from the given vector into this tensor, and the given vector may be emptied.
* This version is the <a href=https://en.cppreference.com/w/cpp/language/reference>
* rvalue reference</a> overloading.
* @param v vector
* @return A NodeInitializer
*/
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
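A brief sketch contrasting the copying and moving overloads:

```cpp
std::vector<float> v = {1.f, 2.f, 3.f};
auto a = graph->constant({3}, inits::fromVector(v));             // copies v
auto b = graph->constant({3}, inits::fromVector(std::move(v)));  // may leave v empty
```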
// @TODO: add documentation
/**
* Initialize tensor from a given sparse vector.
* Creates a NodeInitializer that will initialize the tensor from a given
* sparse vector (stored in std::pair). The resulting tensor is first filled
* with `1e-6` (a small non-zero placeholder), and then the values of the
* given sparse vector are set at their indices.
* @param v the sparse vector is stored in `std::pair`:
* - the first object (v.first) holds the indexes (in a vector)
* - the second object (v.second) holds the corresponding values (in a vector).
* This means the value of the resulting tensor at index v.first[i] is v.second[i].
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);
// @TODO: add documentation
/**
* Initialize tensor by copying from the given io::Item.
* Creates a NodeInitializer that will initialize the tensor by copying the values
* from the given io::Item. If this io::Item is a memory-mapped item, then the
* function will set the memory region pointing to this item. If this io::Item is
* a regular item, then the function will copy the values from this item.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromItem(const io::Item& item);
// @TODO: add documentation
/**
* Initialize tensor by copying from the given tensor.
* Creates a NodeInitializer that will initialize the tensor
* by copying the values from the given tensor.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromTensor(Tensor tensor);
// @TODO: add documentation
/**
* Initialize tensor from a file.
* Creates a NodeInitializer that will initialize the tensor
* by copying the values from the given file. This function is
* mainly used for loading embedding vectors from a file.
* @param file filename
* @param dimVoc the number of words in the vocabulary
* @param dimEmb the length of embedding vectors
* @param normalize a flag indicating whether the values are normalized.
* Here we adopt the method of <a
* href=https://en.wikipedia.org/wiki/Feature_scaling#Scaling_to_unit_length>
* scaling to unit length</a>, i.e., dividing each element by the Euclidean length of the vector.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromWord2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize = false);
/**
* Computes Google's sinusoidal position embeddings.
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
* dimensions of the tensor.
*
* Expected tensor layout {-2: time, -1: model}
*
* Usually gets later reshaped to {time, 1, model} and
* added with a broadcast to learned embeddings. Positional
* embeddings are the same for each batch entry and change
* over time steps.
* dimensions of the tensor. Expected tensor layout {-2: time, -1: model}.
* Usually gets later reshaped to {time, 1, model} and added with a broadcast
* to learned embeddings. Positional embeddings are the same for each batch
* entry and change over time steps.
*/
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
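For reference, and assuming the implementation follows the standard Transformer convention of Vaswani et al., the embeddings have the form (with pos the position, i the channel pair index, and d the model dimension):

$$\mathrm{PE}(pos, 2i) = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad \mathrm{PE}(pos, 2i+1) = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)$$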
/**
* Computes a random rotation matrix for LSH hashing. This is part
* of a hash function. The values are orthonormal and computed via
* Computes a random rotation matrix for LSH hashing.
* This is part of a hash function. The values are orthonormal and computed via
* QR decomposition. The same seed results in the same random rotation.
*/
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
/**
* Computes the equivalent of Python's range().
* Computes a range from begin to end-1, like Python's range().
* The constant being initialized must have one dimension that matches
* the number of elements being generated, while any other dimension must be 1.

View File

@ -5,7 +5,13 @@
#include "tensors/tensor.h"
namespace marian {
/**
* A constant node for the graph.
* A constant node is a constant tensor whose value is
* immutable during training. A ConstantNode instance is usually
* used as an input. To construct a constant node in the
* graph, use the constant() function of the ExpressionGraph class.
*/
struct ConstantNode : public Node {
ConstantNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
@ -35,7 +41,13 @@ private:
Ptr<inits::NodeInitializer> init_;
bool initialized_;
};
/**
* A parameter node for the graph.
* A parameter node is used to store model parameters whose values can be
* changed during training, such as weights and biases. To construct
* a parameter node in the graph, use the param() function of the
* ExpressionGraph class.
*/
struct ParamNode : public Node {
ParamNode(Ptr<ExpressionGraph> graph,
const Shape& shape,

View File

@ -646,7 +646,7 @@ struct CosNodeOp : public UnaryNodeOp {
return {NodeOp(Add(_1 * -sin(_2), child(0)->grad(), adj_, child(0)->val()))};
}
const std::string type() override { return "sin"; }
const std::string type() override { return "cos"; }
};
struct TanNodeOp : public UnaryNodeOp {
@ -662,7 +662,7 @@ struct TanNodeOp : public UnaryNodeOp {
return {NodeOp(Add(_1 / sqr(cos(_2)), child(0)->grad(), adj_, child(0)->val()))};
}
const std::string type() override { return "sin"; }
const std::string type() override { return "tan"; }
};
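Both corrected backward definitions follow directly from the calculus identities d(cos x)/dx = -sin(x) and d(tan x)/dx = 1/cos^2(x): the incoming adjoint (`_1`, i.e. `adj_`) is multiplied by the local derivative evaluated at the child's value (`_2`) and accumulated into the child's gradient via Add().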
struct SqrtNodeOp : public UnaryNodeOp {

View File

@ -37,3 +37,5 @@ template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functio
template void marian::gpu::Add<marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase> >(marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Aggregate<marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, IntrusivePtr<marian::TensorBase> >(marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor);

View File

@ -37,6 +37,9 @@ template void marian::AggregateAll<float, float, marian::functional::BinaryFunct
template void marian::AggregateAll<float, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<float,float,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> >,float,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<float, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<float, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::AggregateAll<float, float, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
@ -75,4 +78,6 @@ template void marian::AggregateAll<__half, float, marian::functional::BinaryFunc
template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<__half,float,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> >,float,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
#endif
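These instantiations back the new cos/tan expression operators from the changelog: the backward pass of y = cos(x) accumulates adj * -sin(x), and the backward pass of y = tan(x) accumulates adj / cos(x)^2, which is exactly what the Mult(Assignee<1>, Neg(Sin(Assignee<2>))) and Div(Assignee<1>, Sqr(Cos(Assignee<2>))) functor types above spell out, for both float and __half element types. A minimal sketch of how such functors are composed, assuming marian's functional placeholders (_1 = adjoint, _2 = forward input) and the Add() aggregation helper from tensors/tensor_operators.h, which lowers to the AggregateAll instantiations above:

// Sketch only, not part of this diff: composing the cos/tan backward
// functors; Add() accumulates the element-wise result into xGrad.
#include "functional/functional.h"     // _1, _2, sin, cos, sqr, unary operator-
#include "tensors/tensor_operators.h"  // Add()

using namespace marian;
using namespace marian::functional;

// d/dx cos(x) = -sin(x)      ->  xGrad += adj * -sin(x)
void accumulateCosGrad(Tensor xGrad, Tensor adj, Tensor xVal) {
  Add(_1 * -sin(_2), xGrad, adj, xVal); // Mult(Assignee<1>, Neg(Sin(Assignee<2>)))
}

// d/dx tan(x) = 1 / cos(x)^2 ->  xGrad += adj / sqr(cos(x))
void accumulateTanGrad(Tensor xGrad, Tensor adj, Tensor xVal) {
  Add(_1 / sqr(cos(_2)), xGrad, adj, xVal); // Div(Assignee<1>, Sqr(Cos(Assignee<2>)))
}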

View File

@ -68,6 +68,8 @@ template void marian::gpu::Element<marian::functional::Assign<marian::functional
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<2>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > > >, IntrusivePtr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<2>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<2> > > >, IntrusivePtr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<2> > > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Clip, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > >, marian::functional::Capture> > > >>(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Clip, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > >, marian::functional::Capture> > > >, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > >, marian::Tensor >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > >, marian::Tensor, marian::Tensor);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Tan, marian::functional::Assignee<2> > >, marian::Tensor >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Tan, marian::functional::Assignee<2> > >, marian::Tensor, marian::Tensor);
// How to add new specializations:
// When you use a new specialization that is not yet instantiated here, the build fails with a link error of this form (example):
// .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
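To illustrate the workflow this comment describes (a sketch, assuming the linker prints the demangled signature as in the example above): copy the signature out of the error message, prepend "template", terminate it with ";", and add the resulting line to this file. Reformatted for readability, the tan instantiation added above would be derived like this:

// Hypothetical linker message (abbreviated):
//   undefined reference to `void marian::gpu::Element<
//       marian::functional::Assign<marian::functional::Var<1>,
//           marian::functional::UnaryFunctor<marian::functional::elem::Tan,
//               marian::functional::Assignee<2> > >,
//       marian::Tensor>(..., marian::Tensor, marian::Tensor)'
//
// Remedy: turn the demangled signature into an explicit instantiation
// (this is a reformatted copy of the tan line added in this hunk):
template void marian::gpu::Element<
    marian::functional::Assign<
        marian::functional::Var<1>,
        marian::functional::UnaryFunctor<marian::functional::elem::Tan,
                                         marian::functional::Assignee<2>>>,
    marian::Tensor>(
    marian::functional::Assign<
        marian::functional::Var<1>,
        marian::functional::UnaryFunctor<marian::functional::elem::Tan,
                                         marian::functional::Assignee<2>>>,
    marian::Tensor, marian::Tensor);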

View File

@ -21,6 +21,12 @@ namespace io {
struct Item;
}
/**
* Main implementation of a <a href="https://en.wikipedia.org/wiki/Tensor">tensor</a>,
* a multi-dimensional matrix containing elements of a single data type.
 * TensorBase holds the data together with its data type, a pointer to the
 * underlying memory region, the shape, backend info, and other attributes.
*/
class TensorBase {
MemoryPiece::PtrType memory_;
Shape shape_;
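As a usage illustration for the attributes the new comment lists (a minimal sketch, assuming the usual TensorBase accessors shape(), type() and size(), and the marian::Tensor alias for IntrusivePtr<TensorBase> from this header):

// Sketch only, not part of this diff: inspect the attributes a tensor carries.
#include <iostream>
#include "tensors/tensor.h"

void describeTensor(marian::Tensor t) {
  std::cerr << "shape: "    << t->shape()   // dimensions of the matrix
            << " type: "    << t->type()    // single element data type
            << " elements: " << t->size()   // total number of elements
            << std::endl;
}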