merge with internal master

Marcin Junczys-Dowmunt 2021-03-02 05:15:41 +00:00
commit 55a7047f8a
39 changed files with 2599 additions and 236 deletions


@@ -20,12 +20,17 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Support for CUDA 11.
- General improvements and fixes for MPI handling, which was essentially non-functional before (syncing, random seeds, deadlocks during saving, validation, etc.)
- Allow compiling with -DUSE_MPI=on together with -DUSE_STATIC_LIBS=on, although MPI is still linked dynamically since it has so many dependencies.
- Fix building server with Boost 1.75
- Missing implementation for cos/tan expression operator
### Changed
- Change compile options a la -DCOMPILE_CUDA_SM35 to -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL,
-DCOMPILE_PASCAL, -DCOMPILE_VOLTA, -DCOMPILE_TURING and -DCOMPILE_AMPERE
- Disable -DCOMPILE_KEPLER, -DCOMPILE_MAXWELL by default.
- Dropped support for legacy graph groups.
- Developer documentation framework based on Sphinx+Doxygen+Breathe+Exhale
- Expression graph documentation (#788)
- Graph operators documentation (#801)
## [1.10.0] - 2021-02-06


@@ -169,7 +169,7 @@ SHORT_NAMES = NO
# description.)
# The default value is: NO.
JAVADOC_AUTOBRIEF = NO
JAVADOC_AUTOBRIEF = YES
# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
# line (until the first dot) of a Qt-style comment as the brief description. If


@@ -1 +1 @@
v1.10.0
v1.10.2


@@ -118,7 +118,6 @@ ModelState::SetMarianConfigPath()
// Set the Marian config path.
std::string config_path("/var/azureml-app/");
config_path.append(std::getenv("AZUREML_MODEL_DIR"));
config_path.append("/nlxseq2seq/triton/nlxseq2seq/1/data/model/");
config_path.append(config_filepath_str);
marian_config_path_ = config_path;
@@ -199,6 +198,16 @@ ModelInstanceState::ModelInstanceState(
extern "C" {
void
handler(int sig) {
void* array[30];
size_t size = backtrace(array, 30);
fprintf(stderr, "Error: signal %d, Exception info:\n", sig);
backtrace_symbols_fd(array, size, STDERR_FILENO);
}
TRITONSERVER_Error*
TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
@@ -209,6 +218,9 @@ TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state))
);
signal(SIGSEGV, handler);
signal(SIGABRT, handler);
return nullptr; // success
}
@@ -308,7 +320,6 @@ TRITONBACKEND_ModelInstanceExecute(
std::vector<TRITONBACKEND_Input*> request_input;
std::vector<int> request_batch_size;
std::vector<std::string> inputs;
std::string input_strings;
// Create a single response object for each request. If something
@@ -389,14 +400,13 @@ TRITONBACKEND_ModelInstanceExecute(
}
content_buffer.insert(
content_buffer.end(), reinterpret_cast<const char*>(input_buffer) + 4,
reinterpret_cast<const char*>(input_buffer) + buffer_byte_size - 4
reinterpret_cast<const char*>(input_buffer) + buffer_byte_size
);
}
std::string s(content_buffer.begin(), content_buffer.end());
int count = std::count(s.begin(), s.end(), '\n');
request_batch_size.push_back(count + 1);
inputs.push_back(s);
content_buffer.clear();
if (input_strings.empty()) {
@@ -433,12 +443,16 @@ TRITONBACKEND_ModelInstanceExecute(
if (output_content == nullptr) {
output_content = pos;
} else {
strcat(output_content, "\n");
strcat(output_content, pos);
// Replace the null terminator of the previous sentence with a newline character
*(pos - 1) = '\n';
}
// Move to next output content.
if (p != nullptr) {
pos = p + 1;
} else {
// Break if there is no output content left; even though batch_size > 0,
// a trailing '\n' may have been consumed by Marian.
break;
}
batch_size--;
}
@@ -567,4 +581,4 @@ TRITONBACKEND_ModelInstanceExecute(
} // extern "C"
}}} // namespace triton::backend::marian
}}} // namespace triton::backend::marian


@@ -1,4 +1,9 @@
#pragma once
#include <stdio.h>
#include <execinfo.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#ifdef _WIN32
#define DLLEXPORT extern "C" __declspec(dllexport)

doc/.gitignore

@@ -0,0 +1,4 @@
api
build
doxygen
venv

doc/Makefile

@@ -0,0 +1,23 @@
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: clean help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# Clean target as recommended by Exhale
# https://exhale.readthedocs.io/en/latest/usage.html#optional-create-a-proper-clean-target
clean:
rm -rf doxygen/ api/
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

doc/README.md

@@ -0,0 +1,51 @@
# Marian NMT code documentation and library API
This directory contains code documentation and library API for developers of Marian NMT.
The documentation is generated using
[Sphinx](https://www.sphinx-doc.org/en/master/usage/quickstart.html) +
[Breathe](https://breathe.readthedocs.io/en/latest/directives.html) +
[Doxygen](http://www.doxygen.nl/manual/docblocks.html) +
[Exhale](https://exhale.readthedocs.io/en/latest/usage.html).
The documentation source code is written in `.rst` or `.md` files with special directives that allow
referencing the C++ source code and its documentation. The source documents are then built into static
HTML pages.
## Installation
On Ubuntu 20.04, install the following packages:
sudo apt-get install python3 python3-pip python3-setuptools doxygen
Then set up a Python environment and install modules:
pip3 install virtualenv
virtualenv venv -p python3
source venv/bin/activate
pip install -r requirements.txt
Documentation building should also work on Windows, but it has not been tested.
## Generation
The documentation can be generated by running:
make html
The website will be generated into `build/html` and is accessible by opening _index.html_ in your
browser.
Directories:
- `build` - automatically generated output directory for the HTML documentation
- `doxygen` - automatically generated Doxygen XML files
- `api` - library API documentation automatically generated with Exhale
- `.rst` and `.md` files in this directory and its subdirectories are documentation source files
- `_static` - custom CSS and JavaScript files
## Writing documentation
To be documented...

doc/_static/css/custom.css

@@ -0,0 +1,4 @@
.wy-body-for-nav > .wy-grid-for-nav > .wy-nav-side {
border-bottom: 5px solid #28bbee;
/*background-color: #494d55;*/
}

doc/conf.py

@@ -0,0 +1,120 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import datetime
import sys
sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = 'Marian NMT'
copyright = '2021, Marian NMT Team'
author = 'Marian NMT Team'
# The full version, including alpha/beta/rc tags
# TODO: add GitHub commit hash to the version
version_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'VERSION')
with open(os.path.abspath(version_file)) as f:
version = f.read().strip()
release = version + ' ' + str(datetime.date.today())
# -- General configuration ---------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
'sphinx.ext.imgmath',
'sphinx.ext.todo',
'breathe',
'exhale',
'recommonmark',
]
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = [
'build',
'doxygen',
'venv',
'README.md',
]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'
htmlhelp_basename = 'marian'
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = ['css/custom.css']
# The base URL which points to the root of the HTML documentation
html_baseurl = 'http://marian-nmt.github.io/docs/api'
# -- Extension configuration -------------------------------------------------
breathe_projects = { 'marian': './doxygen/xml' }
breathe_default_project = 'marian'
doxygen_config = """
INPUT = ../src
EXCLUDE += ../src/3rd_party
EXCLUDE += ../src/tests
EXCLUDE_PATTERNS = *.md *.txt
FILE_PATTERNS += *.cu
EXTENSION_MAPPING += cu=C++ inc=C++
ENABLE_PREPROCESSING = YES
JAVADOC_AUTOBRIEF = YES
WARN_IF_UNDOCUMENTED = NO
"""
exhale_args = {
'containmentFolder' : './api',
'rootFileName' : 'library_index.rst',
'rootFileTitle' : 'Library API',
'doxygenStripFromPath' : '..',
'createTreeView' : True,
'exhaleExecutesDoxygen' : True,
'exhaleDoxygenStdin' : doxygen_config.strip(),
}
primary_domain = 'cpp'
highlight_language = 'cpp'
# A trick to include markdown files from outside the source directory using
# 'mdinclude'. Warning: all other markdown files not included via 'mdinclude'
# will be rendered using recommonmark as recommended by Sphinx
from m2r import MdInclude
def setup(app):
# from m2r to make `mdinclude` work
app.add_config_value('no_underscore_emphasis', False, 'env')
app.add_config_value('m2r_parse_relative_links', False, 'env')
app.add_config_value('m2r_anonymous_references', False, 'env')
app.add_config_value('m2r_disable_inline_math', False, 'env')
app.add_directive('mdinclude', MdInclude)

doc/contributing.rst

@@ -0,0 +1 @@
.. mdinclude:: ../CONTRIBUTING.md

doc/graph.md

@@ -0,0 +1,406 @@
# Expression graphs
The design of the deep learning framework in Marian is based on reverse-mode [auto-differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation) (also known as backpropagation) with dynamic computation graphs.
Computation graphs allow a great deal of freedom in network architectures, and they can deal with complicated structures like conditions and loops.
The dynamic declaration, which means a new graph is created for each training instance (for a training example or a batch), is also advantageous.
It allows handling of variably sized inputs, as well as the cases where the graph may change depending on the results of previous steps.
Compared to static declaration, dynamic declaration can be expensive, as a new computation graph must be created and optimised for every training instance.
Marian uses careful memory management to remove overhead in computation graph construction, and supports efficient execution on both CPU and GPU.
The main implementation of the computation graph lives under the [`src/graph`](api/dir_src_graph.html#dir-src-graph) directory.
Building blocks for graphs:
- [graph construction](#graph-construction)
- [node types](#node-types)
- [graph execution](#graph-execution)
## Graph construction
What is a computation graph?
All the numerical computations are expressed as a computation graph.
A computation graph (or graph in short) is a series of operations arranged into a graph of nodes.
To put it simply, a graph is just an arrangement of nodes that represent what you want to do with the data.
**Example 1**
Suppose you want to calculate the expression: `z=x*y+sin(x)`.
The computation graph of this expression is something like Figure 1.
![fig1](images/graph_example1.jpg "Figure 1 An example of computation graph")
*Figure 1 An example of computation graph*
In Marian, the `ExpressionGraph` class is the main implementation of a computation graph.
An `ExpressionGraph` object keeps a record of data (tensors) and all operations in a directed graph consisting of `Node` objects.
A `Node` is the basic unit of a graph. It can be an operation (e.g., dot()), or a tensor.
Each operation in a graph is a `NaryNodeOp` (a child of `Node` class).
Each operation defines its forward and backward steps.
Besides operations, a `Node` can also be a constant tensor (`ConstantNode`) or a parameter tensor (`ParamNode`).
To create a graph, we use the `New<>` shortcut in place of a regular constructor:
```cpp
// create a graph
auto graph = New<ExpressionGraph>();
```
After creating a graph, we also need to initialise the graph object with device options via `setDevice()` and workspace memory via `reserveWorkspaceMB()`; otherwise the program will crash.
```cpp
// initialise graph with device options
// here we specify device no. is 0
// device type can be DeviceType::cpu or DeviceType::gpu
graph->setDevice({0, DeviceType::cpu});
// preallocate workspace memory (MB) for the graph
graph->reserveWorkspaceMB(128);
```
The _workspace memory_ is the amount of memory available for the forward and backward steps of the training procedure.
It does not include the model parameters and optimizer state, which are allocated outside the workspace;
hence you cannot allocate all device memory to the workspace.
To build a graph, Marian offers a set of shortcut functions that implement the common expression operators for a neural network (see [`src/graph/expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html)), such as `affine()`.
These functions construct the corresponding operation nodes in the graph and link them with other nodes;
e.g., `affine()` constructs an `AffineNodeOp` node in the graph.
Thus, building a graph turns into a simple task of defining expressions by using those functions.
**Building graph of Example 1 using Marian**
The following code is used to build the graph in Example 1 with inputs `x=2` and `y=3`.
```cpp
// create and initialise a graph object
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(8);
// add input node x
auto x = graph->constant({1,1}, inits::fromValue(2));
// add input node y
auto y = graph->constant({1,1}, inits::fromValue(3));
// define expression
auto mulOp = x*y;
auto sinOp = sin(x);
auto z = mulOp + sinOp;
// You can also define this expression: auto z = x*y + sin(x);
```
For the above example, `constant()` is used to construct a constant node (a tensor) in the graph as the input.
We will give more details about this function in the next section [**Node types**](#node-types).
The operators `*` and `+` and the function `sin()` add the corresponding operation nodes (i.e., `MultNodeOp` and `SinNodeOp`) to the graph.
To inspect the graph, Marian offers the `graphviz()` function, which generates the graph layout in Graphviz format for visualisation.
This visualisation might not be practical for real-size graphs due to an enormous number of nodes and layers.
You can print the graph layout on the console by running the following code:
```cpp
// print the graph layout on console
std::cout<<graph->graphviz()<<std::endl;
```
**Graph visualisation of Example 1**
The resulting graph is shown in Figure 2. Here we use an online Graphviz editor [edotor](https://edotor.net/) to generate the graph (by pasting the output of `graphviz()`).
![fig2](images/example1_dot.png "Figure 2 Graph layout of Example 1")
*Figure 2 Graph layout of Example 1*
In Figure 2, there are two numbers (between the pair of parentheses) in each node.
The first number indicates the node ID, and the second number specifies whether the node is trainable (0 means no; 1 means yes).
We will cover the concept of *trainable* in the [**ParamNode section**](#paramnode).
One thing to notice here is that Marian adopts dynamic computation graphs;
this means that the nodes are consumed once the forward or backward pass is performed.
Thus, we need to call the `graphviz()` function before performing the computation.
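For instance, a minimal sketch (reusing the `graph` object from Example 1, and assuming `graphviz()` returns a printable string as suggested by the earlier snippet):
```cpp
// capture the layout first: the nodes are consumed by graph execution
std::string dot = graph->graphviz();
graph->forward();
// dot can still be printed or written to a file here
```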
## Node types
As mentioned earlier, `Node` is the basic unit of a graph.
Each `Node` defines its forward steps in `Node::forward()` and backward steps in `Node::backward()`.
To access the resulting tensor of the forward pass, we can call `Node::val()`,
while `Node::grad()` returns the accumulated gradient (a tensor) from the backward pass.
There are three main classes of Node in Marian: `ConstantNode`, `ParamNode` and `NaryNodeOp`.
### ConstantNode
The `ConstantNode` class is used to construct a constant node in the graph.
A constant node is a constant tensor whose value is immutable during training.
A `ConstantNode` instance is usually used to construct the input layer.
To construct a constant node in the graph, we can use `constant()` function in the `ExpressionGraph` class.
We need to specify the shape and element type for the constant node.
For the shape, we can initialise a `Shape` instance with vector-style initialisation;
e.g., `Shape shape={2,3};` denotes a 2D matrix with `dim[0]=2` and `dim[1]=3`.
The element type must be one of the values of the `Type` enumeration,
which stores all data types supported in Marian, e.g., `Type::float16`.
If the type is not specified, the default type of the graph will be used.
The default type of the graph is usually `Type::float32` unless you change it with `setDefaultElementType()`.
```cpp
// construct a constant node in the graph with default type
auto x = graph->constant({N, NUM_FEATURES}, inits::fromVector(inputData));
```
For the above example, the shape of the constant node is `{N, NUM_FEATURES}`, and the value of the constant node is initialised from a vector `inputData`.
`inits::fromVector()` returns a `NodeInitializer` which is a functor used to initialise a tensor by copying from the given vector.
More functions used to initialise a node can be found in [`src/graph/node_initializers.h`](api/namespace_marian__inits.html#namespace-marian-inits) file.
Marian also provides some shortcut functions to construct special constant nodes, such as `ones()` and `zeros()`:
```cpp
// construct a constant node filled with ones
auto ones = graph->ones({10,10});
// construct a constant node filled with zeros
auto zeros = graph->zeros({10,10});
```
### ParamNode
`ParamNode` is used to store model parameters whose value can be changed during the training, such as weights and biases.
In addition to the shape and the element type, we need to specify whether a `ParamNode` object is _trainable_ or not.
If a parameter node is _trainable_, then its value will be tracked and updated during the training procedure.
For a `ParamNode`, the default value of `trainable_` is `true`.
We can control whether a parameter node is trainable with the `Node::setTrainable()` function.
To construct a parameter node in the graph, we use the `param()` function of the `ExpressionGraph` class.
For a parameter node, we also need to specify its name.
```cpp
// construct a parameter node called W1 in the graph
auto W1 = graph->param("W1", {NUM_FEATURES, 5}, inits::uniform(-0.1f, 0.1f));
```
The parameter node `W1` has a shape of `{NUM_FEATURES, 5}`, and is initialised with random numbers from the uniform distribution `Uniform(-0.1, 0.1)`.
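As a one-line sketch of the `setTrainable()` mechanism mentioned above (using the `W1` node just created; treat this as illustrative rather than a recommended pattern):
```cpp
// exclude W1 from training: its gradient is no longer tracked or applied
W1->setTrainable(false);
```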
### NaryNodeOp
`NaryNodeOp` is the base class that defines the operations in a graph.
It mainly contains unary and binary operators.
Each `NaryNodeOp` defines its forward operations in `Node::forwardOps()` and backward operations in `Node::backwardOps()`.
In the current version of Marian, we provide a set of common operations (inherited from `NaryNodeOp`) used to build a neural network,
such as `AffineNodeOp` (affine transformation), `CrossEntropyNodeOp` (cross-entropy loss function) and `TanhNodeOp` (tanh activation function).
As mentioned earlier, Marian implements a set of APIs that can easily add operations to the graph.
E.g., we can use `affine()` to perform affine transformation and then `tanh()` to perform tanh activation function on the results:
```cpp
// perform affine transformation: x*W1+b
// and then perform tanh activation function
auto h = tanh(affine(x, W1, b1));
```
In the above example, `affine()` and `tanh()` actually add `AffineNodeOp` and `TanhNodeOp` nodes to the graph.
More shortcut functions for adding operations to the graph can be found in the [`src/graph/expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html) file.
## Graph execution
Once you have finished building the graph by adding all the nodes, you can perform the real computation.
### Forward pass
The forward pass refers to the calculation process.
It traverses through all nodes from the input layer (leaves) to the output layer (root).
To perform the forward pass, you can call the function `forward()`. The `forward()` function mainly does two things:
- allocates memory for each node (`Node::allocate()`)
- computes the new tensor for each node by performing the required operations (`Node::forward()`); the resulting tensor is stored in the `val_` attribute of each node.
**Forward pass of Example 1**
To run the forward pass of Example 1, you can run the following code:
```cpp
// Perform the forward pass on the nodes of the graph
graph->forward();
// get the computation result of z
std::vector<float> w;
z->val()->get(w);
std::cout<<"z="<<w[0]<<std::endl;
// The output is: z=6.9093
```
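For reference, this agrees with evaluating the expression directly (with `sin` in radians): `z = x*y + sin(x) = 2*3 + sin(2) ≈ 6 + 0.9093 = 6.9093`.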
### Backward pass
The backward pass refers to the process of propagating the output error backwards through the graph and computing gradients.
It traverses through all *trainable* nodes from the output layer to the input layer.
You can call `backward()` to perform the backward pass.
The `backward()` function mainly computes the gradients using the chain rule:
- allocates memory and initialises gradients for each *trainable* node
- computes the gradients based on the backward steps (`Node::backwardOps()`) of each node, and stores them in the `adj_` attribute of each node
- using the chain rule, propagates all the way to the input layer
We also provide a shortcut function `backprop()` which performs first the forward pass and then the backward pass on the nodes of the graph:
```cpp
// Perform backpropagation on the graph
graph->backprop();
// This function is equivalent to the following code:
/*
graph->forward();
graph->backward();
*/
```
**Backward pass of modified Example 1**
As shown in Figure 2, there is no trainable node in the graph of Example 1;
this means we cannot perform a backward pass on this graph.
To demonstrate the backward pass, we modify Example 1 by changing the constant node `x` to a parameter node (change `constant()` to `param()`).
Here is the modification:
```cpp
// add parameter node x
auto x = graph->param("x", {1,1}, inits::fromValue(2));
```
The resulting graph is also different, as displayed in Figure 3.
![fig3](images/example1_dot2.png "Figure 3 Graph layout of modified Example 1")
*Figure 3 Graph layout of modified Example 1*
To perform the backward pass of modified Example 1, you can run the following code:
```cpp
// Perform the backward pass on the trainable nodes of the graph
graph->backward();
// get the gradient of x node
std::vector<float> b;
x->grad()->get(b);
std::cout<<"dz/dx="<<b[0]<<std::endl;
// The output is: dz/dx=2.58385
```
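This again agrees with the analytic derivative: `dz/dx = y + cos(x) = 3 + cos(2) ≈ 3 - 0.41615 = 2.58385`.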
### Optimiser
After the backward pass, we obtain the gradients of the leaves.
However, the job is not done yet.
To train a model, we need to update the model parameters according to the gradients.
This brings us to how we define the loss function and the optimiser for the graph.
A loss function is used to calculate the model error between the predicted value and the actual value.
The goal is to minimise this error during training.
In a graph, the loss function is also represented as a group of nodes.
You can also use the operators provided in [`expression_operators.h`](api/program_listing_file_src_graph_expression_operators.h.html) file to define the loss function.
E.g., Marian offers `cross_entropy()` function to compute the cross-entropy loss between true labels and predicted labels.
**Define a loss function for modified Example 1**
Suppose we know the actual value of `z` is 6 when `y = 3`, and `x` is the parameter we would like to learn.
The loss function we choose here is the absolute error:
```cpp
// pass the actual value to the model
auto actual = graph->constant({1,1}, inits::fromValue(6));
// define loss function
auto loss = abs(actual-z);
```
The graph is changed to Figure 4.
![fig4](images/example1_dot3.png "Figure 4 Graph layout of modified Example 1 with loss function")
*Figure 4 Graph layout of modified Example 1 with loss function*
The purpose of the optimiser is to adjust the variables to fit the data.
In Marian, there are three built-in optimiser classes: `Sgd`, `Adagrad` and `Adam`.
`Sgd` is an optimiser based on [stochastic gradient descent](https://en.wikipedia.org/wiki/Stochastic_gradient_descent).
For each iteration, it updates the parameter `w` according to the rule of `w = w - learning_rate * gradient`.
`Adagrad` implements [Adagrad algorithm](https://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf),
an optimiser with parameter-specific learning rates, which are adapted relative to how frequently a parameter gets updated during training.
`Adam` is an implementation of the [Adam algorithm](https://arxiv.org/abs/1412.6980),
a stochastic gradient descent method that is based on an adaptive estimation of first-order and second-order moments.
We use `Optimizer<>` to set up an optimiser with the learning rate:
```cpp
// Choose optimizer (Sgd, Adagrad, Adam) and initial learning rate
auto opt = Optimizer<Adam>(0.01);
```
After an iteration of backpropagation, we can call the `update()` function to update the parameters:
```cpp
// update parameters in the graph
opt->update(graph);
```
**Set up an optimiser for modified Example 1**
Continuing with the modified Example 1, we choose `Sgd` as the optimiser and update the parameter `x`:
```cpp
// set up Sgd optimiser with 0.005 learning rate
auto opt = Optimizer<Sgd>(0.005);
// update parameters
opt->update(graph);
// get the new value of x
std::vector<float> v;
x->val()->get(v);
std::cout<<"x="<<v[0]<<std::endl;
// The output is: x=1.98708
```
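This matches the SGD update rule stated earlier: `x = x - 0.005 * 2.58385 ≈ 1.98708`.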
### Debugging
For debugging, we can call `debug()` to print node parameters. The `debug()` function has to be called prior to graph execution.
Once a node is marked for debugging, its value (resulting tensor) and the gradient will be printed out during the forward and backward pass.
It is also recommended to turn on the Marian logger by calling `createLoggers()` for more information.
**Debugging for modified Example 1**
Suppose we want to check the results of node `x` during the computation. We can call `debug()` to mark node `x` for debugging.
```cpp
// mark node x for debugging with logging message "Parameter x"
debug(x, "Parameter x");
```
The output is shown as follows with `createLoggers()`:
```
[2021-02-16 15:10:51] [memory] Reserving 256 B, device gpu0
[2021-02-16 15:10:51] Debug: Parameter x op=param
[2021-02-16 15:10:51] shape=1x1 size=1 type=float32 device=gpu0 ptr=140505547538432 bytes=256
min: 2.00000000 max: 2.00000000 l2-norm: 2.00000000
[[ 2.00000000 ]]
[2021-02-16 15:10:51] [memory] Reserving 256 B, device gpu0
[2021-02-16 15:10:51] Debug Grad: Parameter x op=param
[2021-02-16 15:10:51] shape=1x1 size=1 type=float32 device=gpu0 ptr=140505547538944 bytes=256
min: 2.58385324 max: 2.58385324 l2-norm: 2.58385324
[[ 2.58385324 ]]
```
### More advanced
Looking more closely at graph execution: a graph keeps track of all the `Node` objects in its `nodesForward_` and `nodesBackward_` lists.
`nodesForward_` contains all nodes used for the forward pass and `nodesBackward_` contains all trainable nodes used for the backward pass.
All the tensor objects for a graph are stored in its `tensors_` attribute.
`tensors_` is a shared pointer holding memory and nodes for a graph.
Since each `Node` can result in new tensors, this attribute is used to allocate memory for new tensors during the forward and backward pass.
This `tensors_` attribute gets cleared before a new graph is built.
Another important attribute in `ExpressionGraph` is `paramsByElementType_`.
This attribute holds memory and nodes that correspond to graph parameters.
You can call `params()` function in a graph to get all the parameter objects:
```cpp
// return the Parameters object related to the graph
// The Parameters object holds the whole set of the parameter nodes.
graph->params();
```
In addition, Marian provides APIs to support gradient checkpointing.
This method trades compute for memory: during the backward pass, the forward-pass segment between checkpoints is rerun instead of keeping all intermediate tensors in memory.
Currently, Marian only supports setting checkpoint nodes manually, by calling `Node::markCheckpoint()` or `checkpoint()`.
To enable the gradient-checkpointing mode for a graph, we use `setCheckpointing()`:
```cpp
// enable gradient-checkpointing for a graph
graph->setCheckpointing(true);
```
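Marking a checkpoint might then look like the following sketch (reusing `x`, `W1` and `b1` from the earlier examples; where to place checkpoints is left to the user):
```cpp
// mark the output of this sub-expression as a checkpoint; the segment
// between checkpoints is recomputed during the backward pass
auto h = checkpoint(tanh(affine(x, W1, b1)));
```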
We can also save and load the parameters of a graph in Marian.
We can call `save()` to save all parameters in the graph into a file (`.npz` or `.bin` format).
The function `load()` can load all model parameters to the graph (either from an array of `io::Items`, a file or a buffer).
```cpp
// specify the filename
std::string filename = "my_model.npz";
// save all the parameters into a file
graph->save(filename);
// load model from a file
graph->load(filename);
```

doc/images/example1_dot.png (binary image)


doc/images/example2.png (binary image)


doc/index.rst

@@ -0,0 +1,47 @@
Welcome to Marian's documentation!
==================================
|buildgpu| |buildcpu| |tests| |release| |license|
Marian is an efficient and self-contained Neural Machine Translation framework with an integrated
automatic differentiation engine based on dynamic computation graphs, written entirely in C++.
This is developer documentation. User documentation is available at https://marian-nmt.github.io/docs/
.. toctree::
:maxdepth: 2
:caption: Contents:
graph
operators
api/library_index
contributing
Indices and tables
------------------
* :ref:`genindex`
.. |buildgpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cuda-10.1.svg?label=CUDAC%20Build
:target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev/
:alt: GPU build status
.. |buildcpu| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-dev-cpu.svg?label=CPU%20Build
:target: http://vali.inf.ed.ac.uk/jenkins/job/marian-dev-cpu/
:alt: CPU build status
.. |tests| image:: https://img.shields.io/jenkins/s/http/vali.inf.ed.ac.uk/jenkins/view/marian/job/marian-regression-tests.svg?label=Tests
:target: http://vali.inf.ed.ac.uk/jenkins/job/marian-regression-tests/
:alt: Tests status
.. |release| image:: https://img.shields.io/github/release/marian-nmt/marian.svg?label=Release
:target: https://github.com/marian-nmt/marian/releases
:alt: Latest release
.. |license| image:: https://img.shields.io/badge/License-MIT-blue.svg
:target: ../LICENSE.md
:alt: License: MIT

doc/make.bat

@@ -0,0 +1,35 @@
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
:end
popd

doc/operators.md

@@ -0,0 +1,553 @@
# Operations in the Expression Graph
Operations are responsible for manipulating the elements of an expression graph.
In Marian, many useful operations have already been implemented and can be found in
the code documentation. The provided operations cover simple arithmetic, logical
comparisons and common mathematical functions, as well as tensor manipulation,
for example `slice` or `reshape`, and aggregations such as `sum` or `minimum`.
Finally, other routines useful in building neural networks, such as activation
functions, are also available.
There are several components required to implement an operation in
Marian's expression graph. The highest-level component is the Expression
Operator, responsible for setting up the Node Operator and adding it to the
graph. Next, this Node Operator describes the nature of the forward and backward
operation to be performed. These operations are implemented using some
combination of Functional Operators (element wise), and Tensor Operators.
This overview aims to explain what each of the different operator components
does, how they fit together and where to go to make changes, so that, equipped
with this knowledge, you are able to add new functionality to Marian.
## Operator Structure
The central component in the graph is the `Chainable<Tensor>` object. This
object provides the abstract interface necessary to interact with elements in
the computation graph. The details of this interface can be found in
[/src/graph/chainable.h](api/file_src_graph_chainable.h.html). Note that the
template parameter corresponds to the underlying data structure, which in Marian
is the `Tensor`. Therefore, for convenience, the type `Expr` is defined:
```cpp
typedef IPtr<Chainable<Tensor>> Expr;
```
The implementation of the different operator components are divided across
several files:
- Expression Operator
- [/src/graph/expression_operators.h](api/file_src_graph_expression_operators.h.html)
- [/src/graph/expression_operators.cpp](api/file_src_graph_expression_operators.cpp.html)
- Node Operator
- [/src/graph/node_operators_unary.h](api/file_src_graph_node_operators_unary.h.html)
- [/src/graph/node_operators_binary.h](api/file_src_graph_node_operators_binary.h.html)
- [/src/graph/node_operators_tuple.h](api/file_src_graph_node_operators_tuple.h.html)
- Functional Operator
- [/src/functional/operators.h](api/file_src_functional_operators.h.html)
- Tensor operation
- [/src/tensors/tensor_operators.h](api/file_src_tensors_tensor_operators.h.html)
- [/src/tensors/cpu/tensor_operators.cpp](api/file_src_tensors_cpu_tensor_operators.cpp.html)
- [/src/tensors/gpu/tensor_operators.cu](api/file_src_tensors_gpu_tensor_operators.cu.html)
- Declared Specialization
- [/src/tensors/gpu/element.inc](api/program_listing_file_src_tensors_gpu_element.inc.html)
- [/src/tensors/gpu/add.inc](api/program_listing_file_src_tensors_gpu_add.inc.html)
- [/src/tensors/gpu/add_all.inc](api/program_listing_file_src_tensors_gpu_add_all.inc.html)
To understand how the different components are inter-linked, we'll look at each
of them in turn.
## Expression Operator
The expression operator is the user-facing method used when building a graph. It
is responsible for constructing the corresponding Node Operation and inserting
it into the expression graph. To accommodate these core requirements, the
function `Expression` is able to perform both actions in generality:
```cpp
template <class T, typename... Args>
Expr Expression(Args&&... args) {
auto e = Expr(new T(std::forward<Args>(args)...));
return e->graph()->add(e);
}
```
This helper-function simplifies the definition of many expression operators. For
example, the implementation of the expression operator `sin(x)` is simply:
```cpp
// src/graph/expression_operators.h
Expr sin(Expr x);
// src/graph/expression_operators.cpp
Expr sin(Expr x) {
return Expression<SinNodeOp>(x);
}
```
However, implementations may perform actions beyond the core functionality
alone. Taking `sum` as an example:
```cpp
Expr sum(Expr a, int ax) {
if(a->shape()[ax] == 1) {
return a;
}
return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum);
}
```
The trivial operation is handled without needing to construct a node operation.
This example also demonstrates a non-trivial construction of `ReduceNodeOp`,
which is capable of performing differing reduction operations depending on
instantiation.
Going further, an expression operator may be defined in terms of existing
expressions. Operators such as `weighted_average` are composed of three
different expression operator calls: `scalar_product`, `sum`, and `operator/`.
```cpp
Expr weighted_average(Expr in, Expr weights, int ax) {
auto p = scalar_product(in, weights, ax);
auto s = sum(weights, ax);
return p / s;
}
```
While useful, composition at this level may be less efficient than lower-level
implementations.
## Node Operator
The `Node` subclass of `Chainable<Tensor>` provides concrete implementations for
much of the abstract interface, while subclasses of `Node` enable different node
behaviours. In the context of operations, the relevant derived class is
`NaryNodeOp`, which is the base class used for Node Operators. This subclass provides
implementation focused on performing general N-arity operations. However, many
common operations are unary and, for convenience, a further specialization,
`UnaryNodeOp`, exists to simplify their definition.
The purpose of the Node Operator is to define the forward and backward behaviour
of the operation. The forward operation performs the desired operation while the
backward operation updates the gradients. These behaviours are written in terms
of `NodeOps`, where a `NodeOp` is a macro wrapping a capturing lambda
function. Explicitly, these are defined as:
```cpp
// src/graph/chainable.h
#define NodeOp(op) [=]() { op; }
typedef std::vector<std::function<void()>> NodeOps;
```
Each `NodeOp` is written as a function in terms of the value (`val_`), gradient
(`adj_`) of the current node, and its children, via `child()`. The value and
gradient of the n<sup>th</sup> child node are accessed via the interfaces
`child(n)->val()` and `child(n)->grad()`, respectively. NodeOps are executed in
order when running the graph forwards and backwards, as this snippet from `Node`
demonstrates:
```cpp
// Node in src/graph/node.h
virtual void runForward(const NodeOps& ops) {
for(auto&& op : ops)
op();
}
virtual void runBackward(const NodeOps& ops) {
size_t i = 0;
for(auto&& op : ops)
if(child(i++)->trainable())
op();
}
```
In the backward operation it is **crucial** that the `NodeOp` responsible for
propagating a gradient to `child(i)` is the i<sup>th</sup> element of the
NodeOps vector. Since a NodeOp is only run when the child at its position is
trainable, an out-of-position NodeOp may not be run. To represent no
operation, a `nullptr` can be passed as a NodeOp, as in the sketch below.
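As an illustration, here is a hedged sketch of `backwardOps()` for a hypothetical binary node whose second child is non-differentiable and non-trainable (e.g., a constant tensor of integer indices), so its slot holds a `nullptr`:
```cpp
NodeOps backwardOps() override {
  using namespace functional;
  // position 0 propagates the incoming gradient adj_ to child(0);
  // position 1 is nullptr because child(1) receives no gradient
  return {NodeOp(Add(_1, child(0)->grad(), adj_)),
          nullptr};
}
```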
A typical node operator has the functionality demonstrated in the following
snippet.
```cpp
// outline of a node op
struct MyNodeOp : public NaryNodeOp {
MyNodeOp(Expr a)
: NaryNodeOp({a}, newShape(...), newType(...)) {}
Shape newShape(...) {} // optional
Type newType(...) {} // optional
const std::string type() override { return "my_node_op"; }
virtual size_t hash() override {} // potentially required
virtual bool equal(Expr node) override {} // potentially required
NodeOps forwardOps() override {}
NodeOps backwardOps() override {}
};
```
This outline describes a node operator that takes a single argument `a`. The
shape and type of the node would be determined by the result of `newShape` and
`newType` when constructing the `NaryNodeOp`. These functions represent any
custom logic used to determine the shape and type of the node. As indicated in
this example code, these are optional and, when omitted, calling
`NaryNodeOp({a})` would result in a node with the same shape and type as `a`.
The `type()` method returns the friendly name for the node. Note that the
[ONNX](https://onnx.ai)
[interface](api/program_listing_file_src_onnx_expression_graph_onnx_serialization.cpp.html)
maintains a mapping of these friendly names to their ONNX representation. In the
absence of any member variables the `hash()` and `equal()` methods can be
omitted, and defer to their `NaryNodeOp` definition. However, if such variables
exist then `hash()` should implement a hashed representation and `equal()`
should provide the necessary conditions to consider nodes equivalent. Finally,
the operations of the node are defined in `forwardOps()` and `backwardOps()`.
Continuing with the example of `sin(x)`, the code responsible for implementing
the behaviour is
```cpp
// src/graph/node_operators_unary.h
struct SinNodeOp : public UnaryNodeOp {
SinNodeOp(Expr x) : UnaryNodeOp(x) {}
NodeOps forwardOps() override {
using namespace functional;
return {NodeOp(Element(_1 = sin(_2), val_, child(0)->val()))};
}
NodeOps backwardOps() override {
using namespace functional;
return {NodeOp(Add(_1 * cos(_2), child(0)->grad(), adj_, child(0)->val()))};
}
const std::string type() override { return "sin"; }
};
```
In this code, the constructor trivially initialises the `UnaryNodeOp`, passing
the expression `x` as its input. This propagates up to `NaryNodeOp` and becomes
`child(0)` of the node. The shape and type of the `SinNodeOp` are equivalent to
those of `x`. The lack of any member variables allows the `hash()` and `equal()`
methods to be omitted. The friendly name for this node is the string `sin`. The
forward and backward implementations are accomplished using a single NodeOp each.
### Forward operation
The forward NodeOp calls the tensor operator `Element`, which executes the
element-wise operation described by the functor:
```cpp
_1 = sin(_2)
```
The placeholders `_1`, `_2` are enabled by code in
[/src/functional](api/dir_src_functional.html) and interoperate with the
functional operators. In the call to `Element`, `val_` is assigned to `_1` and
`child(0)->val()` to `_2`. Therefore, this has the action of setting the
elements of this node to the result obtained by applying `sin` to the elements
of `child(0)`.
### Backward Operation
The backward NodeOp is responsible for backpropagation of the gradients via
reverse-mode automatic differentiation. In this example, where `y = sin(x)`,
this corresponds to evaluating
```
dJ/dx += dJ/dy * dy/dx, dy/dx = cos(x)
```
This is realised using the tensor operator `Add` with the functor
```cpp
_1 * cos(_2)
```
In the call to `Add`, `adj_` is assigned to `_1` and `child(0)->val()` to `_2`.
Therefore, this functor represents `dJ/dy * dy/dx`: the product of the gradient
at the current node and the gradient of the operation. This value is then added
to the gradient of the child `child(0)->grad()` as required.
### Shape and Type Changes
The `newShape` and `newType` methods are just a suggestion of how custom logic
may be encapsulated where needed. However, in practice, many operations do not
require a change in shape or type. In these instances, the node inherits the
broadcasted shape of its children as well as their common type. An important
feature of the type deduction in `NaryNodeOp::commonType()` is that it
guarantees that all child nodes are of the same type.
There are few operations in Marian that require a type specification. Where they
do exist, they are often simple, as the desired type is explicitly provided or
trivially deduced. An example of this is `CastNodeOp`:
```cpp
// CastNodeOp in src/graph/node_operators_unary.h
CastNodeOp(Expr a, Type type) : UnaryNodeOp(a, type) {}
```
The desired type is set explicitly in construction. A slightly different example
is that of `CSRDotNodeOp`. It has several child nodes which are a mixture of
`DataType` and `IndexType` and therefore do not share a common type. The
solution is to explicitly specify the relevant children to
`NaryNodeOp::commonType({...})`.
Shape modifying operations are more common. A simple example is the class of
operations performed by `ReduceNodeOp` which involve an aggregation process
along one axis of the Tensor. The output shape is determined by
```cpp
// ReduceNodeOp in src/graph/node_operators_unary.h
Shape newShape(Expr a, int axis) {
Shape shape = a->shape();
axis_ = shape.axis(axis);
shape.set(axis_, 1);
return shape;
}
```
The output shape is the same as the input's, but with the processed axis reduced
to a single element; e.g., summing over axis 1 of a {2,3} tensor yields shape {2,1}.
Other use cases include transpose and slicing operations, as well as tensor products.
## Functional Operator
As the NodeOps are evaluated, they encounter the underlying data type of the
`Tensor`. At this stage, type-specific intrinsic functions are required. These
intrinsics are implemented in the templated struct `Ops<ElementType>`, with a
specialization required for each type. The current required types are:
- float
- double
- float32x4 (see `src/3rd_party/sse_mathfun.h`)
- float32x8 (see `src/3rd_party/avx_mathfun.h`)
- half (see `cuda_fp16.h` in the CUDA Math API)
Further details are available in
[/src/common/types.h](api/file_src_common_types.h.html).
Returning to the example of `sin(x)`, the specialization for `float` and
`double` requires
```cpp
// src/functional/operators.h
// in namespace marian::functional
template <typename T>
struct Ops {
static HOST_DEVICE_INLINE T sin(const T&) { ABORT("Unknown type"); }
};
// Specialization for float
template <>
struct Ops<float> {
static HOST_DEVICE_INLINE float sin(const float& x) { return sinf(x); }
};
// Specialization for double
template <>
struct Ops<double> {
static HOST_DEVICE_INLINE double sin(const double& x) { return std::sin(x); }
};
```
The remaining specializations can be seen in
[/src/functional/operators.h](api/file_src_functional_operators.h.html). Note
that the general template must produce a runtime abort.
The final component of the functional operator is to call the macro that enables
interoperability with the framework of
[/src/functional](api/dir_src_functional.html). For a unary operator, this is
the macro `UNARY`.
```cpp
UNARY(Sin, sin, Ops<ElementType>::sin(x));
```
where the template parameter `ElementType` **must** be used. There are equivalent
macros for `BINARY` and `TERNARY` Ops.
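For illustration, a binary operator follows the same pattern; this sketch assumes `pow` specializations exist in `Ops` for each supported type:
```cpp
BINARY(Pow, pow, Ops<ElementType>::pow(x, y));
```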
## Tensor Operator
Tensor operations use less abstracted interfaces to interact with the Tensors,
often working with the Tensor data directly. They also rely on BLAS (Basic
Linear Algebra Subprograms) libraries to accelerate these operations, as well as
libraries containing device-specific optimisations. These libraries include:
- CPU
- CBLAS / OpenBLAS
- FBGEMM
- INTGEMM
- MKL
- GPU
- CUDA (cuBLAS)
An important subtlety is that while the CPU-focused libraries use a row-major
representation, the cuBLAS library (GPU) instead uses a column-major
representation.
Furthermore, the OpenMPI and OpenMP libraries are employed for parallelisation,
while macros provided in
[/src/common/definitions.h](api/file_src_common_definitions.h.html) locally
enable faster floating-point math in supported compilers.
```cpp
MARIAN_FFAST_MATH_BEGIN
// ffmath code
MARIAN_FFAST_MATH_END
```
The usual caveats about enabling `fast_math` apply; they are described in
[/src/common/definitions.h](api/file_src_common_definitions.h.html).
Tensor operators are declared in
[/src/tensors/tensor_operators.h](api/file_src_tensors_tensor_operators.h.html);
these are device-agnostic functions that call the relevant device-specific
implementation. The CPU- and GPU-specific implementations are defined in the `cpu`
namespace in [/src/tensors/cpu/](api/dir_src_tensors_cpu.html) and the `gpu`
namespace in [/src/tensors/gpu/](api/dir_src_tensors_gpu.html). A typical
operator therefore defers to an implementation in the device-specific namespace.
```cpp
void TensorOp(marian::Tensor out, marian::Tensor in) {
#ifdef CUDA_FOUND
if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
gpu::TensorOp(out, in);
else
#endif
cpu::TensorOp(out, in);
}
```
When compiled with GPU support, this function dispatches a call to the
implementation that corresponds to the backend device type configured in the
graph (either GPU or CPU). Without GPU support, only the CPU implementation is
available.
Many operations are covered by three general tensor operators: `Element`,
`Aggregate` and `Prod`. The `Element` operator applies a function element-wise
across an arbitrary number of input tensors and stores the result in the output
tensor. The `Aggregate` operator also applies a function element-wise across its
inputs, but instead aggregates the results in the output via a given aggregation
function. A common aggregation function used is addition, which is the basis of
the `Add` and `Reduce` operators. Finally, `Prod` deals with products of
tensors. This operator performs a general matrix multiplication with the
underlying implementation relying on the libraries mentioned above.
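As a rough sketch of how the first two of these operators are invoked (using the signatures discussed later on this page; the tensors `out`, `in1` and `in2` are assumed to exist with compatible shapes):
```cpp
using namespace marian::functional;
// element-wise product: out[i] = in1[i] * in2[i]
Element(_1 = _2 * _3, out, in1, in2);
// aggregation by addition: out[i] += 1.0f * (in1[i] * in2[i])
Add(_1 * _2, 1.0f, out, in1, in2);
```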
Specialized operators exist to manipulate tensors beyond the cases covered
above, such as transposition and concatenation. These operators may even
be expressed in terms of existing tensor operators.
Furthermore, for complicated multi-operation computations, performance gains and
memory improvements may be realised by implementing a tensor operator for that
specific purpose. An example of this is `softmax`, which could be implemented
using multiple expression operators (`exp`, `sum`), but is instead implemented
directly as a tensor operator. These optimized implementations may be device
specific.
## Declared Specialization
The operations performed in the forward and backward methods of NodeOp require
their GPU templates to be explicitly declared. When a new specialization is
introduced without being explicitly instantiated, it will cause a link error at
compilation:
```
.../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
```
To fix these undefined references, we must explicitly add the specialization to
the `.inc` files of [/src/tensors/gpu/](api/dir_src_tensors_gpu.html). Each
`.inc` file is included at the end of its corresponding `.cu` file, ensuring
that the specialization is compiled.
The undefined references should be added to the `.inc` file that corresponds to
the header file that contains the declarations of the missing functions.
The file [element.inc](api/file_src_tensors_gpu_element.inc.html) contains the
specializations of the function defined in
[element.h](api/file_src_tensors_gpu_element.h.html):
```cpp
// src/tensors/gpu/element.h
template <class Functor, class... Tensors>
void Element(Functor functor, Tensor out, Tensors... tensors);
```
Similarly, [add.inc](api/file_src_tensors_gpu_add.inc.html) contains the
specializations for functions matching either of the two signatures in
[add.h](api/file_src_tensors_gpu_add.h.html):
```cpp
// src/tensors/gpu/add.h
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors);
template <class Functor, class AggFunctor, class... Tensors>
void Aggregate(Functor functor, float initAgg, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors);
```
Finally [add_all.inc](api/file_src_tensors_gpu_add_all.inc.html) contains the
specializations for [add_all.h](api/file_src_tensors_gpu_add_all.h.html), which
are several versions of:
```cpp
// src/tensors/gpu/add_all.h
template <typename T, typename AccType, class Functor, class AggFunctor>
void AggregateAll(Ptr<Allocator> allocator,
Functor functor,
AccType aggInit,
AggFunctor aggFunctor,
AccType scale,
Tensor out,
const Tensor in1);
```
However, for [add_all.h](api/file_src_tensors_gpu_add_all.h.html), there is an
additional type dependence in the first template parameter, which requires two
entries:
```cpp
marian::gpu::AggregateAll< float, ... >( ... );
marian::gpu::AggregateAll< __half, ... >( ... ); // for COMPILE_FP16
```
where the `__half` specialization is related to half-precision floats and should
be added to the `COMPILE_FP16` preprocessor block.
The simplest method to add the correct specialization is to take the compilation
error output and extract the needed signature. To extract the signature:
1. Replace up to, and including, "undefined reference to `" with "template"
2. Replace the final ' with a semi-colon
To conform with definitions in the codebase, we should replace
`IntrusivePtr<marian::TensorBase>` with its typedef `marian::Tensor`. Note that
as these files are included in the `marian::gpu` namespace and explicitly use
the `marian::functional` namespace, it is also possible to omit both of these
prefixes. Typically, the namespace prefix of the specialized function is removed
as well. Following these rules for the example of `SinNodeOp` results in the
following entries:
**element**
```cpp
template void Element<Assign<Var<1>, UnaryFunctor<elem::Sin, Assignee<2> > >, marian::Tensor >(Assign<Var<1>, UnaryFunctor<elem::Sin, Assignee<2> > >, marian::Tensor, marian::Tensor);
```
**add**
```cpp
template void Add<BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,class marian::Tensor,class marian::Tensor >(BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,float,class marian::Tensor,class marian::Tensor,class marian::Tensor);
```
**add_all**
```cpp
template void AggregateAll<float,float,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> > >(std::shared_ptr<marian::Allocator>,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,float,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> >,float,marian::Tensor,marian::Tensor,marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half,float,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> > >(std::shared_ptr<marian::Allocator>,BinaryFunctor<elem::Mult,Assignee<1>,UnaryFunctor<elem::Cos,Assignee<2> > >,float,BinaryFunctor<elem::Plus,Assignee<1>,Assignee<2> >,float,marian::Tensor,marian::Tensor,marian::Tensor);
#endif
```

doc/requirements.txt

@@ -0,0 +1,6 @@
sphinx==2.4.4
breathe==4.13.0
exhale
sphinx_rtd_theme
recommonmark
m2r


@@ -127,6 +127,7 @@ IPtr<T> INew(Ptr<T> p) {
return IPtr<T>(p);
}
/// enum class DeviceType: defines which device is used for computation
enum class DeviceType : size_t { gpu = 0, cpu = 1 };
struct DeviceId {


@@ -28,6 +28,14 @@ struct Slice // Python-like slice/index descriptor
};
typedef std::vector<Slice> Slices;
/**
* Shape class mainly defines the shape or dimensionality of the node.
* Basically, Shape is a wrapper of a std::vector. Its size is the number of
* dimensions. E.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3.
* When the index is negative, the real index is size() + index.
* It implements most common functions demanded by operations, e.g., resize(),
* slice(), and broadcast().
*/
struct Shape {
private:
std::vector<int> shape_;


@@ -143,7 +143,7 @@ do { \
default: ABORT("Unknown type {}", type); \
} \
} while(0)
/// namespace marian
namespace marian {
// small struct to enable templating based on types use for packing
@@ -290,36 +290,37 @@ constexpr inline size_t operator+(size_t val, TypeClass typeClass) {
}
// @TODO: rename to ElementType when things become stable, so it's easier to review
/// enum class Type: stores all supported data types in Marian
enum class Type : size_t {
int8 = TypeClass::signed_type + 1u,
int16 = TypeClass::signed_type + 2u,
int32 = TypeClass::signed_type + 4u,
int64 = TypeClass::signed_type + 8u,
int8 = TypeClass::signed_type + 1u, ///< int8 type
int16 = TypeClass::signed_type + 2u, ///< int16 type
int32 = TypeClass::signed_type + 4u, ///< int32 type
int64 = TypeClass::signed_type + 8u, ///< int64 type
uint8 = TypeClass::unsigned_type + 1u,
uint16 = TypeClass::unsigned_type + 2u,
uint32 = TypeClass::unsigned_type + 4u,
uint64 = TypeClass::unsigned_type + 8u,
uint8 = TypeClass::unsigned_type + 1u, ///< uint8 type
uint16 = TypeClass::unsigned_type + 2u, ///< uint16 type
uint32 = TypeClass::unsigned_type + 4u, ///< uint32 type
uint64 = TypeClass::unsigned_type + 8u, ///< uint64 type
float16 = TypeClass::float_type + 2u,
float32 = TypeClass::float_type + 4u,
float64 = TypeClass::float_type + 8u,
float16 = TypeClass::float_type + 2u, ///< float16 type
float32 = TypeClass::float_type + 4u, ///< float32 type
float64 = TypeClass::float_type + 8u, ///< float64 type
packed16 = TypeClass::packed_type + 2u, // special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, // special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, // special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed invidually. Internal actual type (uint8) is meaningless.
packed16 = TypeClass::packed_type + 2u, ///< special type for FBGEMM, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint16) is meaningless.
packed8avx2 = TypeClass::packed_type + 1u + TypeClass::avx2_type, ///< special type for FBGEMM with AVX2, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
packed8avx512 = TypeClass::packed_type + 1u + TypeClass::avx512_type, ///< special type for FBGEMM with AVX512, not meant to be used anywhere else, not meant to be accessed individually. Internal actual type (uint8) is meaningless.
intgemm8 = TypeClass::intgemm_type + 1u, // Int8 quantized (not packed) matrices for intgemm
intgemm16 = TypeClass::intgemm_type + 2u, // Int16 quantized (not packed) matrices for intgemm
intgemm8 = TypeClass::intgemm_type + 1u, ///< Int8 quantized (not packed) matrices for intgemm
intgemm16 = TypeClass::intgemm_type + 2u, ///< Int16 quantized (not packed) matrices for intgemm
intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, ///< Int8 quantized and packed (ssse3) matrices for intgemm
intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, ///< Int8 quantized and packed (avx2) matrices for intgemm
intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, ///< Int8 quantized and packed (avx512) matrices for intgemm
intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, ///< Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
intgemm8ssse3 = TypeClass::intgemm_type + 1u + TypeClass::ssse3_type, // Int8 quantized and packed (ssse3) matrices for intgemm
intgemm8avx2 = TypeClass::intgemm_type + 1u + TypeClass::avx2_type, // Int8 quantized and packed (avx2) matrices for intgemm
intgemm8avx512 = TypeClass::intgemm_type + 1u + TypeClass::avx512_type, // Int8 quantized and packed (avx512) matrices for intgemm
intgemm8avx512vnni = TypeClass::intgemm_type + 1u + TypeClass::avx512_type + 4096u, // Int8 quantized and packed (avx512) matrices for intgemm. VNNI algorithm
intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, // Int16 quantized and packed (sse2) matrices for intgemm
intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, // Int16 quantized and packed (avx2) matrices for intgemm
intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, // Int16 quantized and packed (avx512) matrices for intgemm
intgemm16sse2 = TypeClass::intgemm_type + 2u + TypeClass::sse2_type, ///< Int16 quantized and packed (sse2) matrices for intgemm
intgemm16avx2 = TypeClass::intgemm_type + 2u + TypeClass::avx2_type, ///< Int16 quantized and packed (avx2) matrices for intgemm
intgemm16avx512 = TypeClass::intgemm_type + 2u + TypeClass::avx512_type, ///< Int16 quantized and packed (avx512) matrices for intgemm
};
static inline size_t operator&(TypeClass typeClass, Type type) {

View File

@ -39,6 +39,12 @@ struct BinaryFunctor {
}
};
/**
* Macro to set up unary-functions from marian::functional::Ops.
* @param name name for the struct
* @param name2 callable typedef
* @param func function wrapped
*/
#define UNARY(name, name2, func) \
namespace elem { \
struct name { \
@ -55,6 +61,12 @@ struct BinaryFunctor {
} \
static inline name<Capture> name2(Capture x) { return name<Capture>(x); }
/**
* Macro to set up binary-functions from marian::functional::Ops.
* @param name name for the struct
* @param name2 callable typedef
* @param func function wrapped
*/
#define BINARY(name, name2, func) \
namespace elem { \
struct name { \
@ -95,6 +107,12 @@ struct TernaryFunctor {
}
};
/**
* Macro to set up ternary-functions from marian::functional::Ops.
* @param name name for the struct
* @param name2 callable typedef
* @param func function wrapped
*/
#define TERNARY(name, name2, func) \
namespace elem { \
struct name { \
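To make the macro pattern concrete, here is a hedged sketch of typical invocations; the exact operator list and the `sinf`/`powf` mappings are assumptions, not a quote of the real operator table:

```cpp
// Each invocation generates an elem::<Name> struct plus a callable
// marian::functional::<name2>() that builds the corresponding functor.
UNARY(Sin, sin, sinf(x));      // enables sin(_1) in element-wise functors
BINARY(Pow, pow, powf(x, y));  // enables pow(_1, _2)
```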

View File

@ -30,7 +30,7 @@ Expr ExpressionGraph::add(Expr node) {
} else {
node->setId(count_++);
// record in foward graph
// record in forward graph
nodesForward_.push_back(node);
// record in backward graph if training, and keep track of roots
@ -143,6 +143,11 @@ void ExpressionGraph::forward(std::list<Expr>& forwardTape, bool finalPass) {
if(inferenceOnly_)
v->children().clear();
// If checkpointing is disabled, keep the memory for forward signals for all nodes.
// If checkpointing is enabled:
// (a) In the forward pass before the backward pass, free the memory for the nodes in the subtape to save memory.
// (b) In the forward calls during the backward pass, keep the memory in the current subtape to accelerate
// gradient computation.
if(checkpointing_ && !finalPass) {
auto subtape = v->getSubtape();
if(subtape) {
@ -171,12 +176,14 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
ABORT("Aborting");
}
// allocates memory and initialises gradients for parameters
for(auto kvParams : paramsByElementType_) {
kvParams.second->allocateBackward();
if(reset)
kvParams.second->set_zero_adjoint();
}
// for top nodes: allocates memory and initialise gradients to 1
for(auto&& v : topNodes_)
v->init_dependent();
@ -186,13 +193,16 @@ void ExpressionGraph::backward(bool reset, float clipValue) {
bool firstNaN = true;
while(!nodesBackward_.empty()) {
auto v = nodesBackward_.back();
nodesBackward_.pop_back();
auto v = nodesBackward_.back(); // return the last element
nodesBackward_.pop_back(); // remove the last element
// for non-top nodes: allocates memory and initialises gradients to 0
for(auto&& child : v->children())
if(child->trainable() && child->type() != "param")
child->set_zero_adjoint();
// if using gradient checkpointing,
// recompute the forward pass from checkpoint to the root
if(checkpointing_ && v->getSubtape()) {
forward(*v->getSubtape(), /*finalPass=*/true);
}

View File

@ -16,9 +16,18 @@
namespace marian {
/**
* Create an expression node of any type, and pass all
* arguments to any available constructor.
* E.g., to create a ConstantNode, use `Expression<ConstantNode>(...)`.
*/
template <class T, typename... Args>
Expr Expression(Args&&... args);
/**
* The whole tensor set in the graph.
* Holds all tensor objects (memory and nodes) for a graph.
*/
class Tensors {
private:
Ptr<TensorAllocator> tensors_;
@ -27,8 +36,8 @@ private:
typedef std::unordered_map<size_t, std::vector<WExpr>> WeakMemory;
typedef std::unordered_map<size_t, std::vector<Expr>> Memory;
Ptr<WeakMemory> shortterm_;
Ptr<Memory> longterm_;
Ptr<WeakMemory> shortterm_; // holds all nodes for a graph
Ptr<Memory> longterm_; // holds memoized nodes
public:
Tensors(Ptr<Backend> backend)
@ -112,97 +121,145 @@ public:
typedef std::map<Type, Ptr<Parameters>> ElementTypeParamsMap; // keep it sorted, hence map not unordered map
/**
* Main implementation of a computation graph.
* Keeps a record of data (tensors) and all operations. Each operation in a computation graph is a Node.
* Each Node defines its forward and backward steps.
*/
class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
size_t count_{0};
size_t count_{0}; // counter for nodes in the graph; holds the current node index
std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed.
std::unordered_set<Expr> topNodes_; // current set of roots. In the end, all but one must have been consumed
protected: // (these are protected, not private, for ONNX exporting)
std::list<Expr> nodesForward_;
std::list<Expr> nodesBackward_;
std::list<Expr> nodesForward_; ///< contains all nodes used for forward()
std::list<Expr> nodesBackward_; ///< contains trainable nodes used for backward()
// Holds memory and expressions that correspond to temporary expressions.
// This gets cleared before a new graph is built.
/**
* A shared pointer to the tensor objects in the graph.
* Holds memory and nodes that correspond to tensors in a graph.
* Since operations will result in new tensors, this attribute is used
* to allocate memory for new tensors during forward() and backward().
* This gets cleared before a new graph is built.
*/
Ptr<Tensors> tensors_;
private:
std::unordered_map<size_t, std::vector<Expr>> memoized_;
Type defaultElementType_{Type::float32}; // Type used for storing parameters, currently all parameters have to have the same type
bool inferenceOnly_{false};
bool inferenceOnly_{false}; // a flag indicating whether the graph is used for inference only
bool checkpointing_{false}; // use gradient checkpointing if true
bool reloaded_{false};
bool reloaded_{false}; // a flag indicating whether the graph has been reloaded: true if the graph loaded parameters via the load() function
bool throwNaN_{false};
bool throwNaN_{false}; // a flag indicating whether the graph throws an exception when a NaN value is encountered
protected:
// Delete, copy and move constructors
ExpressionGraph(const ExpressionGraph&) = delete;
ExpressionGraph(ExpressionGraph&&) = delete;
// Holds memory and expressions that correspond to graph parameters
// Now we can have multiple types of parameters in a separate parameters object per value type.
// This is currently only accessible through private functions during loading, will abort during training
// when params() is called (e.g. optimizer) and there is more or other types than the default parameter type.
// Currently the only usecase is inference. Trying to access params() for non-default parameter type is going
// to abort. Inference does not need to access a whole set of parameters.
/**
* A map that holds memory and nodes corresponding to graph parameters.
* The key is a Type and the mapped value is a set of parameter objects with the corresponding type.
* Now we can have multiple types of parameters in a separate parameters object per value type.
* This is currently only accessible through private functions during loading, and will abort during training
* when params() is called (e.g. by the optimizer) and there are more or other types than the default parameter type.
* Currently the only use case is inference. Trying to access params() for a non-default parameter type is going
* to abort. Inference does not need to access the whole set of parameters.
*/
ElementTypeParamsMap paramsByElementType_;
Ptr<Backend> backend_;
std::string namespace_;
Ptr<Backend> backend_; ///< a shared pointer to the backend for the graph
std::string namespace_; ///< a string defining the namespace of the graph; each graph has its own unique namespace
public:
/** @brief Constructs a new expression graph
*
* Constructor should be used as New<ExpressionGraph>()
*/
/** Constructs a new expression graph. Constructor should be used as New<ExpressionGraph>(). */
ExpressionGraph(bool inference = false);
/** Destructor. Clear everything related to the graph except memoized nodes. */
virtual ~ExpressionGraph() {
clear();
for(auto kvParams : paramsByElementType_)
kvParams.second->clear();
}
/**
* Set device options used to run the graph.
* @param deviceId a DeviceId struct storing the device number (size_t)
* and the device type (DeviceType::cpu or DeviceType::gpu)
* @param device a pointer to the device object
*/
virtual void setDevice(DeviceId deviceId = {0, DeviceType::gpu},
Ptr<Device> device = nullptr);
/**
* Get device info for the graph.
* @return a DeviceId struct storing the device number (size_t)
* and the device type (DeviceType::cpu or DeviceType::gpu)
*/
DeviceId getDeviceId() { return backend_->getDeviceId(); }
/**
* Get the backend pointer for the graph.
* @return a Ptr<Backend> pointing to the backend
*/
Ptr<Backend> getBackend() { return backend_; }
/** Set whether the graph is used for inference only */
void setInference(bool inference) { inferenceOnly_ = inference; }
/** Check whether the graph is used for inference only (true) or not */
bool isInference() { return inferenceOnly_; }
/**
* Set whether the graph uses gradient checkpointing.
* <a href="https://github.com/cybertronai/gradient-checkpointing">Gradient Checkpointing</a>
* works by trading compute for memory: during the backward pass it reruns the forward computation for each checkpoint segment instead of keeping all intermediate results.
*/
void setCheckpointing(bool checkpointing) { checkpointing_ = checkpointing; }
/** Check whether the graph uses gradient checkpointing or not */
bool isCheckpointing() { return checkpointing_; }
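A minimal sketch of enabling checkpointing, assuming it is configured before the graph is built:

```cpp
auto graph = New<ExpressionGraph>();
graph->setCheckpointing(true);  // trade recomputation for lower peak memory
// ... build nodes, then run graph->forward() / graph->backward() as usual
```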
/**
* Set namespace (std::string) for the graph.
* Each graph has its own unique namespace, which is used to form the name of a parameter object.
*/
void switchParams(const std::string& newNamespace) {
namespace_ = newNamespace;
}
/**
* Copy all parameter objects from one graph to the current graph.
* @param graph a pointer to a graph object
*/
virtual void copyParams(Ptr<ExpressionGraph> graph) {
for(auto p : *graph->params())
param(p->name(), p->shape(), inits::fromTensor(p->val()), p->value_type());
forward(); // this will allocate parameters, execute the intializers and therefore copy parameter values
forward(); // this will allocate parameters, execute the initializers and therefore copy parameter values
}
/**
* Preallocate workspace memory (MB) for the graph.
* Sets the size of the memory available for the forward and backward steps of the training procedure.
* This does not include the model size and optimizer parameters, which are allocated outside the workspace.
*/
void reserveWorkspaceMB(size_t num) {
size_t bytes = num * 1024 * 1024 - 1;
tensors_->reserve(bytes);
}
/** Copy tensor objects from one graph to the current graph */
void reuseWorkspace(Ptr<ExpressionGraph> graph) {
tensors_ = graph->tensors_;
}
/**
* @brief Performs backpropogation on this expression graph.
*
* Backpropogation is implemented by performing first the forward pass and
* Performs backpropagation on this expression graph.
* Backpropagation is implemented by performing first the forward pass and
* then the backward pass of algorithmic differentiation (AD) on the nodes of
* the graph.
*/
@ -211,6 +268,12 @@ public:
backward();
}
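A minimal end-to-end sketch of the workflow documented here; the expression operators dot(), flatten() and sum() are assumed from Marian's operator library:

```cpp
auto graph = New<ExpressionGraph>();
graph->setDevice({0, DeviceType::cpu});
graph->reserveWorkspaceMB(128);

auto x = graph->constant({2, 3}, inits::fromValue(1.f));
auto W = graph->param("W", {3, 4}, inits::glorotUniform());
auto loss = sum(flatten(dot(x, W)));  // reduce to a scalar top node

graph->forward();   // compute values for all nodes
graph->backward();  // accumulate gradients, e.g. into W->grad()
```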
/**
* Perform one complete backpropagation pass on the graph to test
* whether the computation fits into the given workspace memory.
* This function is used when searching for the maximal batch size
* that fits into the given workspace memory.
*/
bool fits() {
try {
tensors_->throwAtReallocation(true);
@ -223,19 +286,50 @@ public:
return true;
}
/**
* Check whether the memory allocated for a tensor object contains a NaN or infinite value.
* @param t a Tensor object
* @param isNaN a bool reference receiving whether the tensor contains a NaN value
* @param isInf a bool reference receiving whether the tensor contains an infinite value
*/
void checkNaN(Tensor t, bool& isNaN, bool& isInf);
/**
* Perform the forward pass on the nodes of the graph.
* The forward pass computes the values of all nodes,
* traversing them from the input layer to the output layer.
*/
void forward() {
for(auto kvParams : paramsByElementType_)
kvParams.second->allocateForward();
forwardNext();
}
/**
* Perform the forward pass without memory allocation for parameters.
* Helper function for forward().
*/
void forwardNext();
/**
* Perform the forward pass on the given nodes, with a finalPass flag.
* Helper function for forward() and backward().
* @param forwardTape a reference to the list of nodes used for the forward pass
* @param finalPass a bool that controls whether node memory may be freed when gradient checkpointing is enabled
*/
void forward(std::list<Expr>& forwardTape, bool finalPass);
/**
* Perform the backward pass on the trainable nodes of the graph.
* The backward pass computes the gradients of the output error with respect
* to the trainable nodes, traversing them from the output layer to the input layer.
*/
void backward(bool reset = true, float clipValue = 0.f);
/**
* Generate graph layout in Graphviz format for visualisation.
* @return a string representing the graph layout in Graphviz format (dot)
*/
std::string graphviz() {
std::stringstream ss;
ss << "digraph ExpressionGraph {" << std::endl;
@ -253,6 +347,10 @@ public:
return ss.str();
}
/**
* Write graph layout in Graphviz format to a file.
* @param filename a string specifying the file to which the graph layout is written
*/
void graphviz(const std::string& filename) {
std::ofstream dot(filename);
dot << graphviz();
@ -345,6 +443,18 @@ private:
}
public:
/**
* Construct a parameter node in the graph.
* @param pname a string holding the name of the parameter node
* @param shape a Shape struct defining the shape of the parameter tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
* @param fixed a bool specifying whether the parameter object is fixed (not trainable) or not.
* The default value is false, which means the parameter is trainable.
* @return a pointer to the parameter node
*/
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@ -354,6 +464,17 @@ public:
return param(pname, shape, init, elementType, fixed, /*typeSpecified=*/true);
}
/**
* Construct a parameter node in the graph without a specified type;
* the type is set to defaultElementType_.
* @param pname a string holding the name of the parameter node
* @param shape a Shape struct defining the shape of the parameter tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @param fixed a bool specifying whether the parameter object is fixed (not trainable) or not.
* The default value is false, which means the parameter is trainable.
* @return a pointer to the parameter node
*/
Expr param(const std::string& pname,
const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
@ -362,28 +483,59 @@ public:
return param(pname, shape, init, defaultElementType_, fixed, /*typeSpecified=*/false);
}
/**
* Construct a constant node in the graph with a specified element type.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
* @return a pointer to the constant node
*/
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init,
Type elementType) {
return Expression<ConstantNode>(shared_from_this(), shape, init, elementType);
}
/**
* Construct a constant node in the graph without a specified type;
* the type is set to defaultElementType_.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param init a pointer to a NodeInitializer object, e.g., inits::zeros()
* @return a pointer to the constant node
*/
Expr constant(const Shape& shape,
const Ptr<inits::NodeInitializer>& init) {
return Expression<ConstantNode>(shared_from_this(), shape, init, defaultElementType_);
}
// @TODO: add version with iterators
// shortcut to turn vector of indices to integer tensor, to be used with operators
// like rows or select
/**
* Turn a vector of indices into an integer tensor.
* A shortcut to convert a vector of indices into an integer tensor, to be used with operators
* like rows() or index_select().
* @param indicesVector a vector of IndexType (uint32_t) specifying the indices
*/
Expr indices(const std::vector<IndexType>& indicesVector) {
return constant({(int)indicesVector.size()},
inits::fromVector(indicesVector),
Type::uint32);
}
// this version sets up the shape such that the indices are in a given axis
// Use this if you want to pass these indices to gather().
// indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1)
/**
* Specify the indices of elements to be taken from a tensor.
* This version sets up the shape such that the indices are in a given axis.
* Use this if you want to pass these indices to gather().
* E.g., indexee shape = (3, 2, 5, 2); axis = 1 -> resulting shape = (1, size of indicesVector, 1, 1):
* - The size of the resulting shape is the same as that of the indexee; here it is 4.
* - The dimension of the specified axis equals the size of the given indicesVector.
* - The remaining axes are set to 1.
* @param indicesVector a vector of IndexType (uint32_t) specifying the indices
* @param indexee the source tensor that we want to select elements from
* @param axis the axis along which to collect the elements
*/
Expr indices(const std::vector<IndexType>& indicesVector, Expr indexee, int axis = -1) {
Shape shape;
shape.resize(indexee->shape().size());
@ -393,24 +545,70 @@ public:
Type::uint32);
}
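A short sketch of the gather() pairing described above, assuming `indexee` is a (3, 5) tensor already in the graph:

```cpp
// Select columns 0 and 2 along axis 1; idx gets shape (1, 2) per the rule above.
auto idx      = graph->indices({0, 2}, indexee, /*axis=*/1);
auto selected = gather(indexee, /*axis=*/1, idx);  // result has shape (3, 2)
```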
/**
* Construct a constant node filled with `1`.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr ones(const Shape& shape, Type elementType) {
return constant(shape, inits::ones(), elementType);
}
/**
* Construct a constant node filled with `1` without a specified type;
* the type is set to defaultElementType_.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
*/
Expr ones(const Shape& shape) {
return constant(shape, inits::ones(), defaultElementType_);
}
/**
* Construct a constant node filled with `0`.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr zeros(const Shape& shape, Type elementType) {
return constant(shape, inits::zeros(), elementType);
}
/**
* Construct a constant node filled with `0` without a specified type;
* the type is set to defaultElementType_.
* @param shape a Shape struct defining the shape of the constant tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
*/
Expr zeros(const Shape& shape) {
return constant(shape, inits::zeros(), defaultElementType_);
}
// prob = dropProb, e.g. 0.1 means 90% of values are kept
/**
* Construct a dropout mask (a tensor of 0 and 1).
* @param dropProb a float specifying the dropout probability.
* E.g., dropProb=0.1 means 90% of values are kept.
* @param shape a Shape struct defining the shape of the mask tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
* @param elementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr dropoutMask(float dropProb, const Shape& shape, Type elementType);
/**
* Construct a dropout mask (a tensor of 0 and 1) without a specified type,
* and the type is set to defaultElementType_.
* @param dropProb a float specifying the dropout probability.
* E.g., dropProb=0.1 means 90% of values are kept.
* @param shape a Shape struct defining the shape of the mask tensor,
* e.g., shape={2,3} means a 2D matrix with dim[0]=2 and dim[1]=3
*/
Expr dropoutMask(float dropProb, const Shape& shape);
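A minimal sketch of applying such a mask; element-wise multiplication via `operator*` on expressions is assumed:

```cpp
auto mask  = graph->dropoutMask(0.1f, y->shape());  // ~90% ones, ~10% zeros
auto yDrop = y * mask;                              // element-wise masking
```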
/**
* Get the parameter object by name.
* @param name a string specifying the name of the parameter object
*/
Expr get(std::string name) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@ -419,6 +617,11 @@ public:
return p;
}
/**
* Get the parameter object by name and type.
* @param name a string specifying the name of the parameter object
* @param specifiedElementType a scoped enumerator (enum class) defining the element type, e.g., Type::float16
*/
Expr get(std::string name, Type specifiedElementType) {
if(!namespace_.empty())
name = namespace_ + "::" + name;
@ -427,6 +630,10 @@ public:
return p;
}
/**
* Return the Parameters object related to the graph.
* The Parameters object holds the whole set of the parameter nodes.
*/
Ptr<Parameters>& params() {
// There are no parameter objects, that's weird.
ABORT_IF(paramsByElementType_.empty(), "No parameter object has been created");
@ -441,6 +648,10 @@ public:
return it->second;
}
/**
* Set the default element type for the graph.
* The default is used when a node's element type is not specified.
*/
void setDefaultElementType(Type defaultElementType) {
ABORT_IF(!paramsByElementType_.empty() && defaultElementType != defaultElementType_,
"Parameter objects already exist, cannot change default type from {} to {}",
@ -448,33 +659,58 @@ public:
defaultElementType_ = defaultElementType;
}
/**
* Get the default element type for the graph.
*/
Type getDefaultElementType() { return defaultElementType_; }
/**
* Add an expression node to the graph.
* @param node a pointer to an expression node
*/
Expr add(Expr node);
/**
* Allocate memory for the forward pass of the given node.
* @param node a pointer to an expression node
*/
void allocateForward(Expr node) {
if(tensors_)
tensors_->allocateForward(node);
}
/**
* Allocate memory for the backward pass of the given node.
* @param node a pointer to an expression node
*/
void allocateBackward(Expr node) {
if(tensors_)
tensors_->allocateBackward(node);
}
/**
* Free the memory for a tensor object.
* @param tensor a reference to the tensor object
*/
void free(const Tensor& tensor) {
if(tensors_)
tensors_->free(tensor);
}
// Returns the memory allocator of the graph workspace, allocates row unstructured memory (but 256-byte aligned)
/**
* Returns the memory allocator of the graph workspace.
* Allocates raw unstructured memory (but 256-byte aligned).
*/
Ptr<Allocator> allocator() { return tensors_->getAllocator(); } // @TODO: rename this to getAllocator();
// Returns the tensor allocator of the graph workspace, different from above as proper tensor objects are allocated
/**
* Returns the tensor allocator of the graph workspace.
* Different from allocator() as proper tensor objects are allocated.
*/
Ptr<TensorAllocator> getTensorAllocator() { return tensors_->getTensorAllocator(); }
/** Clear everything apart from parameters and memoized nodes */
void clear() {
// clear everything apart from parameters and memoized nodes
count_ = 0;
nodesForward_.clear();
nodesBackward_.clear();
@ -484,13 +720,17 @@ public:
tensors_->clear();
}
/** Set the flag indicating whether the graph has been reloaded (true) or not */
void setReloaded(bool reloaded) { reloaded_ = reloaded; }
/** Set the flag indicating whether the graph throws an exception on NaN values (true) or not */
void setThrowNaN(bool throwNaN) { throwNaN_ = throwNaN; }
/** Get the flag indicating whether the graph throws an exception on NaN values (true) or not */
bool getThrowNaN() { return throwNaN_; }
public:
// loading from array of io::Items
/** Load model (mainly parameter objects) from an array of io::Items */
void load(std::vector<io::Item>& ioItems, bool markReloaded = true) {
setReloaded(false);
for(auto& item : ioItems) {
@ -509,18 +749,24 @@ public:
setReloaded(true);
}
/** Load model by filename */
void load(const std::string& name, bool markReloaded = true) {
LOG(info, "Loading model from {}", name);
auto items = io::loadItems(name);
load(items, markReloaded);
}
/** Load model from a buffer (a pointer to a model in memory) */
void load(const void* ptr, bool markReloaded = true) {
LOG(info, "Loading model from buffer at {}", ptr);
auto items = io::loadItems(ptr);
load(items, markReloaded);
}
/**
* Turn the model (given a pointer to it in memory) into a memory-mapped type
* by converting all the parameter objects to a memory-mapped version, i.e., MappedParameters.
*/
void mmap(const void* ptr, bool markReloaded = true) {
ABORT_IF(backend_->getDeviceId().type != DeviceType::cpu || !inferenceOnly_,
"Memory mapping only supported for CPU inference mode");
@ -543,7 +789,6 @@ public:
}
}
// pre-populate parameters by type
for(auto& item : items) {
auto it1 = paramsByElementType_.find(item.type);
@ -558,9 +803,19 @@ public:
}
public:
// convert all parameters into an array of io::Item elements, for saving
/**
* Convert all parameters into an array of io::Item elements, for saving.
* @param ioItems an array of io::Item elements
* @param saveElementType the element type for saving
*/
void save(std::vector<io::Item>& ioItems, Type saveElementType = Type::float32);
/**
* Save all parameters into a file (.npz or .bin).
* @param name a string specifying the filename
* @param meta a string specifying the name for the io::Item elements; if not specified, the parameter names are preserved.
* @param saveElementType the element type for saving
*/
void save(const std::string& name, const std::string& meta = "", Type saveElementType = Type::float32) {
std::vector<io::Item> ioItems;
save(ioItems, saveElementType);

View File

@ -72,6 +72,14 @@ Expr sin(Expr a) {
return Expression<SinNodeOp>(a);
};
Expr cos(Expr a) {
return Expression<CosNodeOp>(a);
};
Expr tan(Expr a) {
return Expression<TanNodeOp>(a);
};
Expr swish(Expr a) {
return Expression<SwishNodeOp>(a);
}
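A short usage sketch: the newly added trigonometric operators compose like any other unary expression operator:

```cpp
auto y = cos(x);           // element-wise cosine
auto z = tan(x) + sin(x);  // combines freely with other expressions
```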

File diff suppressed because it is too large

View File

@ -27,11 +27,6 @@ void Node::free() {
}
}
/**
* Initialization for backward step of top node
* in computation graph. Allocates memory and sets gradient
* to 1 (df/df == 1).
*/
void Node::init_dependent() {
if(!adj_) {
graph()->allocateBackward(this);
@ -39,12 +34,6 @@ void Node::init_dependent() {
}
}
/**
* Initialization for backward step of any non-top node
* in computation graph. Allocates memory and sets gradient
* to 0 for further accumulation of gradients from all
* parents.
*/
void Node::set_zero_adjoint() {
if(!adj_) {
graph()->allocateBackward(this);

View File

@ -28,13 +28,13 @@ protected:
std::vector<Expr> children_;
Weak<ExpressionGraph> graph_;
Shape shape_{1, 1, 1, 1};
Type valueType_{Type::float32};
Shape shape_{1, 1, 1, 1}; // defines the dimensionality of the node (for tensors)
Type valueType_{Type::float32}; // defines the element type of the node (for tensors)
std::string name_{"none"};
Tensor val_{nullptr};
Tensor adj_{nullptr};
Tensor val_{nullptr}; // the resulting new tensor in forward pass
Tensor adj_{nullptr}; // the accumulated gradients (a tensor) in backward pass
bool markedForDebug_{false};
std::string debugMessage_;
@ -105,9 +105,19 @@ public:
virtual void free() override;
virtual void init() override {};
/**
* Initialization for backward step of top node
* in computation graph. Allocates memory and sets gradient
* to 1 (df/df == 1).
*/
virtual void init_dependent() override;
/**
* Initialization for backward step of any non-top node
* in computation graph. Allocates memory and sets gradient
* to 0 for further accumulation of gradients from all
* parents.
*/
virtual void set_zero_adjoint() override;
virtual Tensor& val() override { return val_; };

View File

@ -98,9 +98,10 @@ Ptr<NodeInitializer> glorotUniform(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(6.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
scale = sqrtf(3.0f / t->shape()[-2]); // results in columns of matrix to be ~unit length
scale = sqrtf(3.0f / t->shape()[-2]); // fanIn mode: the scale adapts to the number of input units
// results in columns of the matrix having ~unit range
if(!fanIn && fanOut)
scale = sqrtf(3.0f / t->shape()[-1]);
scale = sqrtf(3.0f / t->shape()[-1]); // fanOut mode: the scale adapts to the number of output units
scale *= scalingFactor;
@ -112,9 +113,9 @@ Ptr<NodeInitializer> glorotNormal(bool fanIn, bool fanOut, float scalingFactor)
return fromLambda([fanIn, fanOut, scalingFactor](Tensor t) {
float scale = sqrtf(2.0f / (t->shape()[-2] + t->shape()[-1]));
if(fanIn && !fanOut)
scale = sqrtf(1.0f / t->shape()[-2]);
scale = sqrtf(1.0f / t->shape()[-2]); // fanIn mode: the scale adapts to the number of input units
if(!fanIn && fanOut)
scale = sqrtf(1.0f / t->shape()[-1]);
scale = sqrtf(1.0f / t->shape()[-1]); // fanOut mode: the scale adapts to the number of output units
scale *= scalingFactor;
@ -170,7 +171,7 @@ Ptr<NodeInitializer> fromWord2vec(const std::string& file,
bool normalize /*= false*/) {
return fromLambda([file, dimVoc, dimEmb, normalize](Tensor t) {
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
if(normalize) {
if(normalize) { // scaling to unit length:
float norm = 0;
for(auto e : embs)
norm += e * e;

View File

@ -11,17 +11,18 @@
namespace marian {
class ExpressionGraph; // Forward declaration
/**
* The namespace inits.
* Declares the NodeInitializer class and all available functions to initialize a node.
*/
namespace inits {
/**
* Base class for specialized NodeInitializers.
*
* A NodeInitializer is a functor that is associated with parameters
* and constants, and is invoked on a tensor during node intialization.
* You need to override NodeIntializer::apply(Tensor) with your own
* functionality or use a fromLambda intializer.
*
* and constants, and is invoked on a tensor during node initialization.
* You need to override NodeInitializer::apply(Tensor) with your own
* functionality or use a fromLambda initializer.
* See node_initializers.cpp for examples.
*/
class NodeInitializer {
@ -35,155 +36,242 @@ public:
};
/**
* Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
* Use a lambda function of form [](Tensor t) { do something with t } to initialize tensor.
* @param func functor
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func);
/**
* Use a lambda function of form [](Tensor t) { do something with t } to initalize tensor
* Create temporary tensor of Type intermediateType first, initialize and then copy/convert to actual Tensor
* Useful for functions that can only operate on a specific type of tensor
* Use a lambda function of form [](Tensor t) { do something with t } to initialize tensor.
* Create temporary tensor of Type intermediateType first, initialize and then copy/convert to actual Tensor.
* Useful for functions that can only operate on a specific type of tensor.
*/
Ptr<NodeInitializer> fromLambda(std::function<void(Tensor)>&& func, Type intermediateType);
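A small sketch of a custom initializer; the `t->size()` and `t->set(...)` conveniences are assumed from TensorBase:

```cpp
// Fill a tensor with its linear index values.
auto init = inits::fromLambda([](Tensor t) {
  std::vector<float> v(t->size());
  for(size_t i = 0; i < v.size(); ++i)
    v[i] = (float)i;
  t->set(v);
});
auto c = graph->constant({2, 3}, init);
```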
/**
* Initialize tensor with given value
*
* Creates a NodeInitializer that will intialize the given tensor
* Initialize tensor with given value.
* Creates a NodeInitializer that will initialize the given tensor
* with `value`. Works with any underlying numeric tensor type.
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromValue(float value);
/**
* Fill tensor with `0`
*
* Creates a NodeInitializer that will intialize the given tensor
* Fill tensor with `0`.
* Creates a NodeInitializer that will initialize the given tensor
* with `0`. Works with any underlying numeric tensor type.
*
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> zeros() { return fromValue(0.0f); }
/**
* Fill tensor with `1`
*
* Creates a NodeInitializer that will intialize the given tensor
* Fill tensor with `1`.
* Creates a NodeInitializer that will initialize the given tensor
* with `1`. Works with any underlying numeric tensor type.
*
* @return A NodeInitializer
*/
static Ptr<NodeInitializer> ones() { return fromValue(1.0f); }
/**
* Set the diagonal of a two-dimensional square matrix to `value`.
*
* Sets all values of the tensor to 0 and intializes the diagonal with
* Sets all values of the tensor to 0 and initializes the diagonal with
* the given `value`. If no value is specified, `1` is used by default.
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> eye(float value = 1.f);
/**
* Intialize tensor with normally distributed random numbers
*
* Be default this generates floating point numbers from the
* Initialize tensor with normally distributed random numbers.
* By default this generates floating point numbers from the
* normal distribution Normal(0, 1) unless specified differently.
*
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both GPU and CPU computation. The random sequences generated
* are the same on both devices.
*
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
*
* @return A NodeInitializer
*/
Ptr<NodeInitializer> normal(float mean = 0.f, float stddev = 1.f);
/**
* Intialize tensor with uniformly distributed random numbers
*
* Be default this generates floating point numbers from the
* Initialize tensor with uniformly distributed random numbers.
* By default this generates floating point numbers from the
* uniform distribution Uniform(0, 1) unless specified differently.
*
* If compiled with `CUDA`, `marian` will use the `cuRand` library
* for both GPU and CPU computation. The random sequences generated
* are the same on both devices.
*
* If `marian` is compiled without `CUDA`, a random generator
* from the C++ standard library is used. These random generators
* do not have the same random sequences.
*
* @param a the lower bound of the interval
* @param b the upper bound of the interval
* @return A NodeInitializer
*/
Ptr<NodeInitializer> uniform(float a = 0.f, float b = 1.f);
// @TODO: add documentation
/**
* Initialize tensor with random numbers from Bernoulli Distribution.
* The Bernoulli distribution is the discrete probability distribution of
* a random variable which takes value `1` with probability p, and
* value `0` with probability (1-p).
* By default, bernoulli(p) generates a tensor of 1s (with probability p)
* and 0s (with probability 1-p). The `scale` and `shift` parameters
* can map {0,1} to {0,1}*`scale`+`shift`.
* E.g., bernoulli(tensor, 0.5f, 2.f, -1.f) where p=0.5f, scale=2.f, shift=-1.f.
* {0,1} is mapped to {0,1}*2+(-1)= {-1,1}. It generates a tensor composed of
* 50% of 1 and 50% of -1.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> bernoulli(float p, float scale = 1.f, float shift = 0.f);
// @TODO: add documentation
/**
* Initialize tensor with random numbers from Glorot uniform distribution.
* The <a href=http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf>Glorot uniform</a>,
* also called Xavier uniform, is designed to keep the scale of
* the gradients roughly the same in all layers.
* This function offers three variants (modes).
* The values of the tensor are sampled from Uniform(-x*scale, x*scale):
* - when fanIn=false and fanOut=false (by default):
* x = sqrt(6 / (in + out))
* - when fanIn=true and fanOut=false (fanIn mode):
* x = sqrt(3 / in)
* - when fanIn=false and fanOut=true (fanOut mode):
* x = sqrt(3 / out)
* where `in` is the number of input units in the tensor, `out` is the number of output units.
* `scale` is used to change the range of Uniform distribution.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> glorotUniform(bool fanIn = false, bool fanOut = false, float scale = 1.f);
// @TODO: add documentation
/**
* Initialize tensor with random numbers from Glorot Normal distribution.
* Similar to function glorotUniform(), this function adopts Normal distribution instead of
* uniform distribution.
* This function offers three variants (modes).
* The values of the tensor are sampled from Normal(0, x*scale), i.e., with mean 0 and standard deviation x*scale:
* - when fanIn=false and fanOut=false (by default):
* x = sqrt(2 / (in + out))
* - when fanIn=true and fanOut=false (fanIn mode):
* x = sqrt(1 / in)
* - when fanIn=false and fanOut=true (fanOut mode):
* x = sqrt(1 / out)
* where `in` is the number of input units in the tensor, `out` is the number of output units.
* `scale` is used to change the spread of the Normal distribution.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> glorotNormal(bool fanIn = false, bool fanOut = false, float scale = 1.f);
// @TODO: add documentation
Ptr<NodeInitializer> dropout(float dropoutProbabilty);
/**
* Initialize a dropout mask (a tensor of 0 and 1) with given dropout probability.
* <a href=https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf>Dropout</a>
* is proposed as a technique to prevent Neural Networks from overfitting.
* @param dropoutProbability a float defining the dropout probability.
* E.g., dropoutProbability=0.1 means 90% of values are kept.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> dropout(float dropoutProbability);
/**
* Intialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps)
*
* Initialize with gumbel noise, i.e. -log(-log(u)) where u ~ Uniform(0 + eps, 1 - eps).
* @param eps a small constant that protects against log(0)
* @return A NodeInitializer
*/
Ptr<NodeInitializer> gumbel(float eps = 1e-5f);
// @TODO: add documentation
/**
* Initialize tensor by *copying* from the given vector.
* Creates a NodeInitializer that will initialize the tensor
* by *copying* the values from the given vector
* @param v vector
* @return A NodeInitializer
*/
template <typename T>
Ptr<NodeInitializer> fromVector(const std::vector<T>& v);
/**
* Initialize tensor by *moving* from the given vector.
* Creates a NodeInitializer that will initialize the tensor by *moving* the values
* from the given vector into this tensor, and the given vector may be emptied.
* This version is the <a href=https://en.cppreference.com/w/cpp/language/reference>
* rvalue reference</a> overloading.
* @param v vector
* @return A NodeInitializer
*/
template <typename T>
Ptr<NodeInitializer> fromVector(std::vector<T>&& v);
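A brief sketch contrasting the copying and moving overloads:

```cpp
std::vector<float> v = {1.f, 2.f, 3.f};
auto a = graph->constant({3}, inits::fromVector(v));             // copies v
auto b = graph->constant({3}, inits::fromVector(std::move(v)));  // may leave v empty
```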
// @TODO: add documentation
/**
* Initialize tensor from a given sparse vector.
* Creates a NodeInitializer that will initialize the tensor from a given
* sparse vector (stored in std::pair). The resulting tensor is first filled
* with `1e-6` (a small non-zero placeholder), and then the values of the
* given sparse vector are set at their indices.
* @param v the sparse vector is stored in `std::pair`:
* - the first object (v.first) holds the indexes (in a vector)
* - the second object (v.second) holds the corresponding values (in a vector).
* This means the value of the resulting tensor at index v.first[i] is v.second[i].
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromSparseVector(std::pair<std::vector<size_t>, std::vector<float>>& v);
// @TODO: add documentation
/**
* Initialize tensor by copying from the given io::Item.
* Creates a NodeInitializer that will initialize the tensor by copying the values
* from the given io::Item. If this io::Item is a memory-mapped item, then the
* function will set the memory region pointing to this item. If this io::Item is
* a regular item, then the function will copy the values from this item.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromItem(const io::Item& item);
// @TODO: add documentation
/**
* Initialize tensor by copying from the given tensor.
* Creates a NodeInitializer that will initialize the tensor
* by copying the values from the given tensor.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromTensor(Tensor tensor);
// @TODO: add documentation
/**
* Initialize tensor from a file.
* Creates a NodeInitializer that will initialize the tensor
* by copying the values from the given file. This function is
* mainly used for loading embedding vectors from a file.
* @param file filename
* @param dimVoc the number of words in the vocabulary
* @param dimEmb the length of embedding vectors
* @param normalize a flag indicating whether the values are normalized.
* Here we adopt the method of <a
* href=https://en.wikipedia.org/wiki/Feature_scaling#Scaling_to_unit_length>
* scaling to unit length</a>, i.e., dividing each element by the Euclidean length of the vector.
* @return A NodeInitializer
*/
Ptr<NodeInitializer> fromWord2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize = false);
/**
* Computes Google's sinusoidal position embeddings.
* Computes Google's Transformer-style sinusoidal position embeddings
* starting from position 'start' taking into account batch and time
* dimensions of the tensor.
*
* Expected tensor layout {-2: time, -1: model}
*
* Usually gets later reshaped to {time, 1, model} and
* added with a broadcast to learned embeddings. Positional
* embeddings are the same for each batch entry and change
* over time steps.
* dimensions of the tensor. Expected tensor layout {-2: time, -1: model}.
* Usually gets later reshaped to {time, 1, model} and added with a broadcast
* to learned embeddings. Positional embeddings are the same for each batch
* entry and change over time steps.
*/
Ptr<NodeInitializer> sinusoidalPositionEmbeddings(int start);
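For reference, and assuming the implementation follows the standard Transformer convention of Vaswani et al., the embeddings have the form (with pos the position, i the channel pair index, and d the model dimension):

$$\mathrm{PE}(pos, 2i) = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad \mathrm{PE}(pos, 2i+1) = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)$$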
/**
* Computes a random rotation matrix for LSH hashing. This is part
* of a hash function. The values are orthonormal and computed via
* Computes a random rotation matrix for LSH hashing.
* This is part of a hash function. The values are orthonormal and computed via
* QR decomposition. The same seed results in the same random rotation.
*/
Ptr<NodeInitializer> randomRotation(size_t seed = Config::seed);
/**
* Computes the equivalent of Python's range().
* Computes a range from begin to end-1, like Python's range().
* The constant being initialized must have one dimension that matches
* the number of elements being generated, while any other dimension must be 1.

View File

@ -5,7 +5,13 @@
#include "tensors/tensor.h"
namespace marian {
/**
* A constant node for the graph.
* A constant node is a constant tensor whose value is
* immutable during training. A ConstantNode instance is usually
* used as an input. To construct a constant node in the
* graph, use the constant() function of the ExpressionGraph class.
*/
struct ConstantNode : public Node {
ConstantNode(Ptr<ExpressionGraph> graph,
const Shape& shape,
@ -35,7 +41,13 @@ private:
Ptr<inits::NodeInitializer> init_;
bool initialized_;
};
/**
* A parameter node for the graph.
* A parameter node is used to store model parameters whose values can be
* changed during training, such as weights and biases. To construct
* a parameter node in the graph, use the param() function of the
* ExpressionGraph class.
*/
struct ParamNode : public Node {
ParamNode(Ptr<ExpressionGraph> graph,
const Shape& shape,

View File

@ -646,7 +646,7 @@ struct CosNodeOp : public UnaryNodeOp {
return {NodeOp(Add(_1 * -sin(_2), child(0)->grad(), adj_, child(0)->val()))};
}
const std::string type() override { return "sin"; }
const std::string type() override { return "cos"; }
};
struct TanNodeOp : public UnaryNodeOp {
@ -662,7 +662,7 @@ struct TanNodeOp : public UnaryNodeOp {
return {NodeOp(Add(_1 / sqr(cos(_2)), child(0)->grad(), adj_, child(0)->val()))};
}
const std::string type() override { return "sin"; }
const std::string type() override { return "tan"; }
};
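Both corrected backward definitions follow directly from the calculus identities d(cos x)/dx = -sin(x) and d(tan x)/dx = 1/cos^2(x): the incoming adjoint (`_1`, i.e. `adj_`) is multiplied by the local derivative evaluated at the child's value (`_2`) and accumulated into the child's gradient via Add().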
struct SqrtNodeOp : public UnaryNodeOp {

View File

@ -37,3 +37,5 @@ template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functio
template void marian::gpu::Add<marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase> >(marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Aggregate<marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, IntrusivePtr<marian::TensorBase> >(marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>,class IntrusivePtr<class marian::TensorBase>);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, marian::Tensor, marian::Tensor >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, float, marian::Tensor, marian::Tensor, marian::Tensor);

View File

@ -37,6 +37,9 @@ template void marian::AggregateAll<float, float, marian::functional::BinaryFunct
template void marian::AggregateAll<float, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<float,float,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> >,float,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<float, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<float, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::AggregateAll<float, float, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
#if COMPILE_FP16
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void AggregateAll<__half, float, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>>(std::shared_ptr<Allocator>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, BinaryFunctor<elem::Plus, Assignee<1>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
@ -75,4 +78,6 @@ template void marian::AggregateAll<__half, float, marian::functional::BinaryFunc
template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<__half,float,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>,marian::functional::BinaryFunctor<marian::functional::elem::Mult,marian::functional::Assignee<1>,marian::functional::UnaryFunctor<marian::functional::elem::Cos,marian::functional::Assignee<2> > >,float,marian::functional::BinaryFunctor<marian::functional::elem::Plus,marian::functional::Assignee<1>,marian::functional::Assignee<2> >,float,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>,IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<__half, float, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::Assignee<1> >, float, marian::functional::BinaryFunctor<marian::functional::elem::Max, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Neg, marian::functional::UnaryFunctor<marian::functional::elem::Sin, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void marian::AggregateAll<__half, float, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> > >(std::shared_ptr<marian::Allocator>, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqr, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > > >, float, marian::functional::BinaryFunctor<marian::functional::elem::Plus, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, marian::Tensor, marian::Tensor, marian::Tensor);
#endif
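These instantiations back the new cos/tan expression operators from the changelog: the backward pass of y = cos(x) accumulates adj * -sin(x), and the backward pass of y = tan(x) accumulates adj / cos(x)^2, which is exactly what the Mult(Assignee<1>, Neg(Sin(Assignee<2>))) and Div(Assignee<1>, Sqr(Cos(Assignee<2>))) functor types above spell out, for both float and __half element types. A minimal sketch of how such functors are composed, assuming marian's functional placeholders (_1 = adjoint, _2 = forward input) and the Add() aggregation helper from tensors/tensor_operators.h, which lowers to the AggregateAll instantiations above:

// Sketch only, not part of this diff: composing the cos/tan backward
// functors; Add() accumulates the element-wise result into xGrad.
#include "functional/functional.h"     // _1, _2, sin, cos, sqr, unary operator-
#include "tensors/tensor_operators.h"  // Add()

using namespace marian;
using namespace marian::functional;

// d/dx cos(x) = -sin(x)      ->  xGrad += adj * -sin(x)
void accumulateCosGrad(Tensor xGrad, Tensor adj, Tensor xVal) {
  Add(_1 * -sin(_2), xGrad, adj, xVal); // Mult(Assignee<1>, Neg(Sin(Assignee<2>)))
}

// d/dx tan(x) = 1 / cos(x)^2 ->  xGrad += adj / sqr(cos(x))
void accumulateTanGrad(Tensor xGrad, Tensor adj, Tensor xVal) {
  Add(_1 / sqr(cos(_2)), xGrad, adj, xVal); // Div(Assignee<1>, Sqr(Cos(Assignee<2>)))
}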

View File

@ -68,6 +68,8 @@ template void marian::gpu::Element<marian::functional::Assign<marian::functional
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<2>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > > >, IntrusivePtr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<2>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<2> > > >, IntrusivePtr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<2> > > >, IntrusivePtr<marian::TensorBase>, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Clip, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > >, marian::functional::Capture> > > >>(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Sgn, marian::functional::Assignee<1> >, marian::functional::Capture>, marian::functional::BinaryFunctor<marian::functional::elem::Pow, marian::functional::Capture, marian::functional::BinaryFunctor<marian::functional::elem::Clip, marian::functional::UnaryFunctor<marian::functional::elem::Floor, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::UnaryFunctor<marian::functional::elem::Abs, marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::Assignee<1>, marian::functional::Capture> >, marian::functional::Capture> >, marian::functional::UnaryFunctor<marian::functional::elem::Log, marian::functional::Capture> > >, marian::functional::Capture> > > >, IntrusivePtr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > >, marian::Tensor >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Cos, marian::functional::Assignee<2> > >, marian::Tensor, marian::Tensor);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Tan, marian::functional::Assignee<2> > >, marian::Tensor >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Tan, marian::functional::Assignee<2> > >, marian::Tensor, marian::Tensor);
// How to add new specializations:
// When you use a new specialization that is not yet instantiated here, the build fails with a link error of this form (example):
// .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
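To illustrate the workflow this comment describes (a sketch, assuming the linker prints the demangled signature as in the example above): copy the signature out of the error message, prepend "template", terminate it with ";", and add the resulting line to this file. Reformatted for readability, the tan instantiation added above would be derived like this:

// Hypothetical linker message (abbreviated):
//   undefined reference to `void marian::gpu::Element<
//       marian::functional::Assign<marian::functional::Var<1>,
//           marian::functional::UnaryFunctor<marian::functional::elem::Tan,
//               marian::functional::Assignee<2> > >,
//       marian::Tensor>(..., marian::Tensor, marian::Tensor)'
//
// Remedy: turn the demangled signature into an explicit instantiation
// (this is a reformatted copy of the tan line added in this hunk):
template void marian::gpu::Element<
    marian::functional::Assign<
        marian::functional::Var<1>,
        marian::functional::UnaryFunctor<marian::functional::elem::Tan,
                                         marian::functional::Assignee<2>>>,
    marian::Tensor>(
    marian::functional::Assign<
        marian::functional::Var<1>,
        marian::functional::UnaryFunctor<marian::functional::elem::Tan,
                                         marian::functional::Assignee<2>>>,
    marian::Tensor, marian::Tensor);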

View File

@ -21,6 +21,12 @@ namespace io {
struct Item;
}
/**
* Main implementation of a <a href="https://en.wikipedia.org/wiki/Tensor">tensor</a>,
* a multi-dimensional matrix containing elements of a single data type.
 * TensorBase holds the data together with its data type, a pointer to the
 * underlying memory region, the shape, backend info, and other attributes.
*/
class TensorBase {
MemoryPiece::PtrType memory_;
Shape shape_;
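As a usage illustration for the attributes the new comment lists (a minimal sketch, assuming the usual TensorBase accessors shape(), type() and size(), and the marian::Tensor alias for IntrusivePtr<TensorBase> from this header):

// Sketch only, not part of this diff: inspect the attributes a tensor carries.
#include <iostream>
#include "tensors/tensor.h"

void describeTensor(marian::Tensor t) {
  std::cerr << "shape: "    << t->shape()   // dimensions of the matrix
            << " type: "    << t->type()    // single element data type
            << " elements: " << t->size()   // total number of elements
            << std::endl;
}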