diff --git a/.gitignore b/.gitignore index f78c8028..53468680 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,3 @@ build # Examples examples/*/*.gz examples/mnist/*ubyte - -.cproject -.project \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 28287da1..0ad27c34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ project(marian CXX) find_package(CUDA "8.0" REQUIRED) SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -Wno-unused-result -Wno-deprecated -fPIC -Wno-deprecated-gpu-targets") -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35; -DCUDNN) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35;) SET(CUDA_PROPAGATE_HOST_FLAGS OFF) include_directories(${amunn_SOURCE_DIR}) diff --git a/README.md b/README.md index 502df966..c4b5fa1a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ Marian ====== -[![Join the chat at https://gitter.im/MarianNMT/Lobby](https://badges.gitter.im/MarianNMT/Lobby.svg)](https://gitter.im/MarianNMT/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Join the chat at https://gitter.im/amunmt/marian](https://badges.gitter.im/amunmt/marian.svg)](https://gitter.im/amunmt/marian?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=Marian)](http://vali.inf.ed.ac.uk/jenkins/job/Marian/) Google group for commit messages: https://groups.google.com/forum/#!forum/mariannmt @@ -17,30 +18,12 @@ Installation Requirements: * g++ with c++11 -* CUDA and CuDNN +* CUDA * Boost (>= 1.56) -Exporting some paths for CuDNN may be required (put it, for example, in your `.bashrc` file): - - export PATH=$PATH:$HOME/.local/bin:/usr/local/cuda/bin - export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64 - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64 - export CPATH=$CPATH:/usr/local/cudnn-5/include - Compilation with `cmake > 3.5`: mkdir build cd build cmake .. 
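    # If CUDA or Boost live in non-standard locations, hint them to CMake, e.g.:
    #   cmake .. -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda -DBOOST_ROOT=/usr/local/boost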
make -j - -To compile API documentation using Doxygen, first cd to the build directory, and then: - - make doc - -To test, first compile, then: - - cd examples/mnist - make - cd ../../build - ./mnist_benchmark diff --git a/marian/.cproject b/marian/.cproject new file mode 100644 index 00000000..184c39a4 --- /dev/null +++ b/marian/.cproject @@ -0,0 +1,163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/marian/.project b/marian/.project new file mode 100644 index 00000000..d1163076 --- /dev/null +++ b/marian/.project @@ -0,0 +1,34 @@ + + + marian + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + src + 2 + PARENT-1-PROJECT_LOC/src + + + diff --git a/src/3rd_party/threadpool.h b/src/3rd_party/threadpool.h index fb77dfe6..1938b95c 100644 --- a/src/3rd_party/threadpool.h +++ b/src/3rd_party/threadpool.h @@ -45,6 +45,7 @@ class ThreadPool { template auto enqueue(F&& f, Args&&... args) -> std::future::type>; + ~ThreadPool(); size_t getNumTasks() const { @@ -128,6 +129,3 @@ inline ThreadPool::~ThreadPool() { worker.join(); } } - - - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 162c53bd..c56b5c80 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,8 +13,13 @@ cuda_add_library(marian_lib graph/node_operators.cu tensors/tensor.cu kernels/tensor_operators.cu + kernels/dropout.cu layers/param_initializers.cpp common/utils.cpp + common/logging.cpp + common/history.cpp + training/config.cpp + translator/nth_element.cu data/vocab.cpp data/corpus.cpp $ @@ -27,30 +32,39 @@ cuda_add_executable( test/tensor_test.cu ) +cuda_add_executable( + marian_translate + test/marian_translate.cu +) + cuda_add_executable( marian_test test/marian_test.cu ) +cuda_add_executable( + bn_test + test/bn_test.cu +) + cuda_add_executable( marian - command/config.cpp command/marian.cu ) cuda_add_executable( dropout_test test/dropout_test.cu - kernels/dropout_cudnn.cu ) target_link_libraries(marian marian_lib) target_link_libraries(tensor_test marian_lib) target_link_libraries(marian_test marian_lib) target_link_libraries(dropout_test marian_lib) +target_link_libraries(marian_translate marian_lib) +target_link_libraries(bn_test marian_lib) -foreach(exec tensor_test marian_test marian dropout_test) - target_link_libraries(${exec} ${EXT_LIBS} cudnn) +foreach(exec dropout_test tensor_test marian_test marian_translate marian bn_test) target_link_libraries(${exec} ${EXT_LIBS} curand) cuda_add_cublas_to_target(${exec}) set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") diff --git a/src/command/marian.cu b/src/command/marian.cu index dd8b3301..8e255fa2 100644 --- a/src/command/marian.cu +++ b/src/command/marian.cu @@ -1,86 +1,13 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "marian.h" -#include "optimizers/optimizers.h" -#include "optimizers/clippers.h" -#include "data/batch_generator.h" -#include "data/corpus.h" -#include "models/nematus.h" - -#include 
"common/logging.h" -#include "command/config.h" -#include "parallel/graph_group.h" - -namespace marian { - - void TrainingLoop(Ptr options, - Ptr> batchGenerator) { - - auto reporter = New(options); - Ptr graphGroup = New>(options); - graphGroup->setReporter(reporter); - - size_t epochs = 1; - size_t batches = 0; - while((options->get("after-epochs") == 0 - || epochs <= options->get("after-epochs")) && - (options->get("after-batches") == 0 - || batches < options->get("after-batches"))) { - - batchGenerator->prepare(!options->get("no-shuffle")); - - boost::timer::cpu_timer timer; - - while(*batchGenerator) { - - auto batch = batchGenerator->next(); - graphGroup->update(batch); - - } - epochs++; - LOG(info) << "Starting epoch " << epochs << " after " - << reporter->samples << " samples"; - } - LOG(info) << "Training finshed"; - graphGroup->save(); - } -} +#include "models/gnmt.h" int main(int argc, char** argv) { using namespace marian; - using namespace data; - using namespace keywords; - std::shared_ptr info; - info = spdlog::stderr_logger_mt("info"); - info->set_pattern("[%Y-%m-%d %T] %v"); - - auto options = New(argc, argv); - std::cerr << *options << std::endl; - - auto dimVocabs = options->get>("dim-vocabs"); - int dimEmb = options->get("dim-emb"); - int dimRnn = options->get("dim-rnn"); - int dimBatch = options->get("mini-batch"); - int dimMaxiBatch = options->get("maxi-batch"); + auto options = New(argc, argv);; - auto trainSets = options->get>("trainsets"); - auto vocabs = options->get>("vocabs"); - size_t maxSentenceLength = options->get("max-length"); - auto corpus = New(trainSets, vocabs, dimVocabs, maxSentenceLength); - auto bg = New>(corpus, dimBatch, dimMaxiBatch); - - TrainingLoop(options, bg); + Train>(options); return 0; } diff --git a/src/common/definitions.h b/src/common/definitions.h index 9e117a2a..997333f0 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -30,6 +30,7 @@ #include #include "shape.h" +#include "common/logging.h" namespace marian { @@ -93,7 +94,6 @@ namespace marian { // An enumeration of directions enum struct dir { forward, backward, bidirect }; - /** * @brief Defines a set of keywords. * @@ -101,27 +101,32 @@ namespace marian { * will result in the creation of an instance of the Keyword class. 
*/ namespace keywords { - KEY(axis, int) - KEY(shape, Shape) - KEY(value, float) - KEY(prefix, std::string) - KEY(final, bool) - KEY(output_last, bool) - KEY(activation, act) - KEY(direction, dir) - KEY(mask, Expr) - KEY(init, std::function) + KEY(axis, int); + KEY(shape, Shape); + KEY(value, float); + KEY(prefix, std::string); + KEY(final, bool); + KEY(output_last, bool); + KEY(activation, act); + KEY(direction, dir); + KEY(mask, Expr); + KEY(dropout_prob, float); + KEY(init, std::function); - KEY(eta, float) - KEY(beta1, float) - KEY(beta2, float) - KEY(eps, float) - KEY(optimizer, Ptr) - KEY(clip, Ptr) - KEY(batch_size, int) - KEY(max_epochs, int) - KEY(valid, Ptr) + KEY(eta, float); + KEY(beta1, float); + KEY(beta2, float); + KEY(eps, float); + KEY(optimizer, Ptr); + KEY(clip, Ptr); + KEY(batch_size, int); + KEY(normalize, bool); + KEY(skip, bool); + KEY(skip_first, bool); + KEY(coverage, Expr); + KEY(max_epochs, int); + KEY(valid, Ptr); } } diff --git a/src/common/history.cpp b/src/common/history.cpp new file mode 100644 index 00000000..3d3ad857 --- /dev/null +++ b/src/common/history.cpp @@ -0,0 +1,10 @@ +#include "history.h" + +namespace marian { + +History::History(size_t lineNo) + : normalize_(true), + lineNo_(lineNo) +{} + +} diff --git a/src/common/history.h b/src/common/history.h new file mode 100755 index 00000000..fca8a4b5 --- /dev/null +++ b/src/common/history.h @@ -0,0 +1,79 @@ +#pragma once + +#include + +#include "hypothesis.h" + +namespace marian { + +class History { + private: + struct HypothesisCoord { + bool operator<(const HypothesisCoord& hc) const { + return cost < hc.cost; + } + + size_t i; + size_t j; + float cost; + }; + + public: + History(size_t lineNo); + + void Add(const Beam& beam, bool last = false) { + if (beam.back()->GetPrevHyp() != nullptr) { + for (size_t j = 0; j < beam.size(); ++j) + if(beam[j]->GetWord() == 0 || last) { + float cost = normalize_ ? 
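                   // with normalization enabled, the accumulated cost is
                   // divided by the hypothesis length (current history size)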
beam[j]->GetCost() / history_.size() : beam[j]->GetCost(); + topHyps_.push({ history_.size(), j, cost }); + } + } + history_.push_back(beam); + } + + size_t size() const { + return history_.size(); + } + + NBestList NBest(size_t n) const { + NBestList nbest; + auto topHypsCopy = topHyps_; + while (nbest.size() < n && !topHypsCopy.empty()) { + auto bestHypCoord = topHypsCopy.top(); + topHypsCopy.pop(); + + size_t start = bestHypCoord.i; + size_t j = bestHypCoord.j; + + Words targetWords; + Ptr bestHyp = history_[start][j]; + while(bestHyp->GetPrevHyp() != nullptr) { + targetWords.push_back(bestHyp->GetWord()); + bestHyp = bestHyp->GetPrevHyp(); + } + + std::reverse(targetWords.begin(), targetWords.end()); + nbest.emplace_back(targetWords, history_[bestHypCoord.i][bestHypCoord.j]); + } + return nbest; + } + + Result Top() const { + return NBest(1)[0]; + } + + size_t GetLineNum() const + { return lineNo_; } + + private: + std::vector history_; + std::priority_queue topHyps_; + bool normalize_; + size_t lineNo_; + +}; + +typedef std::vector Histories; + +} diff --git a/src/common/hypothesis.h b/src/common/hypothesis.h new file mode 100644 index 00000000..08744566 --- /dev/null +++ b/src/common/hypothesis.h @@ -0,0 +1,58 @@ +#pragma once +#include + +#include "common/definitions.h" + +namespace marian { + +class Hypothesis { + public: + Hypothesis() + : prevHyp_(nullptr), + prevIndex_(0), + word_(0), + cost_(0.0) + {} + + Hypothesis(const Ptr prevHyp, size_t word, size_t prevIndex, float cost) + : prevHyp_(prevHyp), + prevIndex_(prevIndex), + word_(word), + cost_(cost) + {} + + const Ptr GetPrevHyp() const { + return prevHyp_; + } + + size_t GetWord() const { + return word_; + } + + size_t GetPrevStateIndex() const { + return prevIndex_; + } + + float GetCost() const { + return cost_; + } + + std::vector& GetCostBreakdown() { + return costBreakdown_; + } + + private: + const Ptr prevHyp_; + const size_t prevIndex_; + const size_t word_; + const float cost_; + std::vector costBreakdown_; +}; + +typedef std::vector> Beam; +typedef std::vector Beams; +typedef std::vector Words; +typedef std::pair> Result; +typedef std::vector NBestList; + +} diff --git a/src/common/keywords.h b/src/common/keywords.h index db0b5f20..01eb5898 100644 --- a/src/common/keywords.h +++ b/src/common/keywords.h @@ -219,7 +219,7 @@ namespace keywords { */ #define KEY(name, value_type) \ typedef const Keyword name ## _k; \ -name ## _k name; +name ## _k name } diff --git a/src/common/logging.cpp b/src/common/logging.cpp new file mode 100644 index 00000000..8621205e --- /dev/null +++ b/src/common/logging.cpp @@ -0,0 +1,42 @@ +#include "logging.h" +#include "training/config.h" + +std::shared_ptr stderrLogger(const std::string& name, + const std::string& pattern, + const std::vector& files) { + std::vector sinks; + + auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance(); + sinks.push_back(stderr_sink); + + for(auto&& file : files) { + auto file_sink = std::make_shared(file, true); + sinks.push_back(file_sink); + } + + auto logger = std::make_shared(name, begin(sinks), end(sinks)); + + spdlog::register_logger(logger); + logger->set_pattern(pattern); + return logger; +} + +void createLoggers(const marian::Config& options) { + + std::vector generalLogs; + std::vector validLogs; + if(options.has("log")) { + generalLogs.push_back(options.get("log")); + validLogs.push_back(options.get("log")); + } + + if(options.has("valid-log")) { + validLogs.push_back(options.get("valid-log")); + } + + Logger info{stderrLogger("info", 
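                            // timestamp-only pattern for the general log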
"[%Y-%m-%d %T] %v", generalLogs)}; + Logger config{stderrLogger("config", "[%Y-%m-%d %T] [config] %v", generalLogs)}; + Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v", generalLogs)}; + Logger data{stderrLogger("data", "[%Y-%m-%d %T] [data] %v", generalLogs)}; + Logger valid{stderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs)}; +} diff --git a/src/common/logging.h b/src/common/logging.h index 9959fc44..2583a071 100644 --- a/src/common/logging.h +++ b/src/common/logging.h @@ -3,3 +3,15 @@ #include "spdlog/spdlog.h" #define LOG(logger) spdlog::get(#logger)->info() + +typedef std::shared_ptr Logger; +Logger stderrLogger(const std::string&, const std::string&, + const std::vector& = {}); + +namespace marian { + class Config; +} + +void createLoggers(const marian::Config& options); + + diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index 3c6832fc..353a4df2 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -5,7 +5,8 @@ #include -#include "dataset.h" +#include "data/dataset.h" +#include "training/config.h" namespace marian { @@ -21,22 +22,24 @@ class BatchGenerator { private: Ptr data_; + Ptr options_; + typename DataSet::iterator current_; - size_t batchSize_; size_t maxiBatchSize_; std::deque bufferedBatches_; BatchPtr currentBatch_; - void fillBatches() { + void fillBatches(bool shuffle=true) { auto cmp = [](const sample& a, const sample& b) { return a[0].size() < b[0].size(); }; std::priority_queue maxiBatch(cmp); - while(current_ != data_->end() && maxiBatch.size() < maxiBatchSize_) { + int maxSize = options_->get("mini-batch") * options_->get("maxi-batch"); + while(current_ != data_->end() && maxiBatch.size() < maxSize) { maxiBatch.push(*current_); current_++; } @@ -45,7 +48,7 @@ class BatchGenerator { while(!maxiBatch.empty()) { batchVector.push_back(maxiBatch.top()); maxiBatch.pop(); - if(batchVector.size() == batchSize_) { + if(batchVector.size() == options_->get("mini-batch")) { bufferedBatches_.push_back(data_->toBatch(batchVector)); batchVector.clear(); } @@ -53,17 +56,15 @@ class BatchGenerator { if(!batchVector.empty()) bufferedBatches_.push_back(data_->toBatch(batchVector)); - std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end()); + if(shuffle) + std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end()); } public: BatchGenerator(Ptr data, - size_t batchSize=80, - size_t maxiBatchNum=20) + Ptr options) : data_(data), - batchSize_(batchSize), - maxiBatchSize_(batchSize * maxiBatchNum) - { } + options_(options) { } operator bool() const { return !bufferedBatches_.empty(); @@ -84,8 +85,10 @@ class BatchGenerator { void prepare(bool shuffle=true) { if(shuffle) data_->shuffle(); + else + data_->reset(); current_ = data_->begin(); - fillBatches(); + fillBatches(shuffle); } }; diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 03da5ea2..6230bd68 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -1,5 +1,6 @@ #include -#include "corpus.h" + +#include "data/corpus.h" namespace marian { namespace data { @@ -33,20 +34,53 @@ const SentenceTuple& CorpusIterator::dereference() const { return tup_; } -Corpus::Corpus(const std::vector& textPaths, - const std::vector& vocabPaths, - const std::vector& maxVocabs, - size_t maxLength) - : textPaths_(textPaths), - maxLength_(maxLength) -{ - UTIL_THROW_IF2(textPaths.size() != vocabPaths.size(), +Corpus::Corpus(Ptr options) + : options_(options), + textPaths_(options_->get>("train-sets")), + maxLength_(options_->get("max-length")) { + 
+ std::vector vocabPaths; + if(options_->has("vocabs")) + vocabPaths = options_->get>("vocabs"); + + UTIL_THROW_IF2(!vocabPaths.empty() && textPaths_.size() != vocabPaths.size(), "Number of corpus files and vocab files does not agree"); + std::vector maxVocabs = + options_->get>("dim-vocabs"); + std::vector vocabs; - for(int i = 0; i < vocabPaths.size(); ++i) { - vocabs_.emplace_back(vocabPaths[i], maxVocabs[i]); + if(vocabPaths.empty()) { + for(int i = 0; i < textPaths_.size(); ++i) { + Ptr vocab = New(); + vocab->loadOrCreate(textPaths_[i], maxVocabs[i]); + vocabs_.emplace_back(vocab); + } } + else { + for(int i = 0; i < vocabPaths.size(); ++i) { + Ptr vocab = New(); + vocab->load(vocabPaths[i], maxVocabs[i]); + vocabs_.emplace_back(vocab); + } + } + + + for(auto path : textPaths_) { + files_.emplace_back(new InputFileStream(path)); + } +} + +Corpus::Corpus(std::vector paths, + std::vector> vocabs, + Ptr options) + : options_(options), + textPaths_(paths), + vocabs_(vocabs), + maxLength_(options_->get("max-length")) { + + UTIL_THROW_IF2(textPaths_.size() != vocabs_.size(), + "Number of corpus files and vocab files does not agree"); for(auto path : textPaths_) { files_.emplace_back(new InputFileStream(path)); @@ -61,7 +95,7 @@ SentenceTuple Corpus::next() { for(int i = 0; i < files_.size(); ++i) { std::string line; if(std::getline((std::istream&)*files_[i], line)) { - Words words = vocabs_[i](line); + Words words = (*vocabs_[i])(line); if(words.empty()) words.push_back(0); tup.push_back(words); @@ -82,8 +116,15 @@ void Corpus::shuffle() { shuffleFiles(textPaths_); } +void Corpus::reset() { + files_.clear(); + for(auto& path : textPaths_) { + files_.emplace_back(new InputFileStream(path)); + } +} + void Corpus::shuffleFiles(const std::vector& paths) { - std::cerr << "Shuffling files" << std::endl; + LOG(data) << "Shuffling files"; std::vector> corpus; files_.clear(); @@ -129,7 +170,7 @@ void Corpus::shuffleFiles(const std::vector& paths) { files_.emplace_back(new InputFileStream(path)); } - std::cerr << "Done" << std::endl; + LOG(data) << "Done"; } } diff --git a/src/data/corpus.h b/src/data/corpus.h index 58233a6e..c3a0c358 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -4,6 +4,7 @@ #include #include +#include "training/config.h" #include "common/definitions.h" #include "data/vocab.h" #include "common/file_stream.h" @@ -38,11 +39,11 @@ class CorpusBatch { } std::cerr << std::endl; - std::cerr << "\t m: "; - for(auto w : b.second) { - std::cerr << w << " "; - } - std::cerr << std::endl; + //std::cerr << "\t m: "; + //for(auto w : b.second) { + //std::cerr << w << " "; + //} + //std::cerr << std::endl; } } } @@ -88,9 +89,11 @@ class CorpusIterator class Corpus { private: + Ptr options_; + std::vector textPaths_; std::vector> files_; - std::vector vocabs_; + std::vector> vocabs_; size_t maxLength_; void shuffleFiles(const std::vector& paths); @@ -102,14 +105,17 @@ class Corpus { typedef CorpusIterator iterator; typedef SentenceTuple sample; - Corpus(const std::vector& textPaths, - const std::vector& vocabPaths, - const std::vector& maxVocabs, - size_t maxLength = 50); + Corpus(Ptr options); + + Corpus(std::vector paths, + std::vector> vocabs, + Ptr options); sample next(); void shuffle(); + + void reset(); iterator begin() { return iterator(*this); @@ -118,6 +124,10 @@ class Corpus { iterator end() { return iterator(); } + + std::vector>& getVocabs() { + return vocabs_; + } batch_ptr toBatch(const std::vector& batchVector) { int batchSize = batchVector.size(); diff --git 
a/src/data/mnist.h b/src/data/mnist.h deleted file mode 100644 index bf1815dc..00000000 --- a/src/data/mnist.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include -#include -#include -#include -#include -#include - -#include "dataset.h" -#include "batch_generator.h" - -namespace marian { -namespace data { - -/** @brief DataBase capable of reading MNIST data. */ -class MNIST : public DataBase { - private: - const int IMAGE_MAGIC_NUMBER; - const int LABEL_MAGIC_NUMBER; - - Examples examples_; - - public: - - typedef Batch batch_type; - typedef std::shared_ptr batch_ptr; - - /** - * @brief Constructs a DataBase using MNIST data. 
- * - * @param featuresPath Path to file containing MNIST feature values - * @param labelsPath Path to file containing MNIST labels - */ - MNIST(const std::string& featuresPath, - const std::string& labelsPath) - : IMAGE_MAGIC_NUMBER(2051), - LABEL_MAGIC_NUMBER(2049) - { - auto features = ReadImages(featuresPath); - auto labels = ReadLabels(labelsPath); - - UTIL_THROW_IF2(features.size() != labels.size(), - "Features do not match labels"); - - for(int i = 0; i < features.size(); ++i) - examples_.emplace_back(new Example({ features[i], labels[i] })); - } - - ExampleIterator begin() const { - return ExampleIterator(examples_.begin()); - } - - ExampleIterator end() const { - return ExampleIterator(examples_.end()); - } - - void shuffle() { - std::random_shuffle(examples_.begin(), examples_.end()); - } - - batch_ptr toBatch(const Examples& batchVector) { - int batchSize = batchVector.size(); - - std::vector maxDims; - for(auto& ex : batchVector) { - if(maxDims.size() < ex->size()) - maxDims.resize(ex->size(), 0); - for(int i = 0; i < ex->size(); ++i) { - if((*ex)[i]->size() > maxDims[i]) - maxDims[i] = (*ex)[i]->size(); - } - } - - batch_ptr batch(new Batch()); - std::vector iterators; - for(auto& m : maxDims) { - batch->push_back(Shape({batchSize, m})); - iterators.push_back(batch->inputs().back().begin()); - } - - for(auto& ex : batchVector) { - for(int i = 0; i < ex->size(); ++i) { - DataPtr d = (*ex)[i]; - d->resize(maxDims[i], 0.0f); - iterators[i] = std::copy(d->begin(), d->end(), iterators[i]); - } - } - return batch; - } - - private: - typedef unsigned char uchar; - - int reverseInt(int i) { - unsigned char c1, c2, c3, c4; - c1 = i & 255, c2 = (i >> 8) & 255, c3 = (i >> 16) & 255, c4 = (i >> 24) & 255; - return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; - } - - std::vector ReadImages(const std::string& full_path) { - std::ifstream file(full_path); - UTIL_THROW_IF2(!file.is_open(), - "Cannot open file `" + full_path + "`!"); - - int magic_number = 0; - file.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - - UTIL_THROW_IF2(magic_number != IMAGE_MAGIC_NUMBER, - "Invalid MNIST image file!"); - - int number_of_images; - int n_rows = 0; - int n_cols = 0; - - file.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - file.read((char *)&n_rows, sizeof(n_rows)); - n_rows = reverseInt(n_rows); - file.read((char *)&n_cols, sizeof(n_cols)); - n_cols = reverseInt(n_cols); - - int imgSize = n_rows * n_cols; - std::vector _dataset(number_of_images); - for(int i = 0; i < number_of_images; ++i) { - _dataset[i].reset(new Data(imgSize, 0)); - for (int j = 0; j < imgSize; j++) { - unsigned char pixel = 0; - file.read((char*)&pixel, sizeof(pixel)); - (*_dataset[i])[j] = pixel / 255.0f; - } - } - return _dataset; - } - - std::vector ReadLabels(const std::string& full_path) { - std::ifstream file(full_path); - - if (! 
file.is_open()) - throw std::runtime_error("Cannot open file `" + full_path + "`!"); - - int magic_number = 0; - file.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - - if (magic_number != LABEL_MAGIC_NUMBER) - throw std::runtime_error("Invalid MNIST label file!"); - - int number_of_labels; - file.read((char *)&number_of_labels, sizeof(number_of_labels)); - number_of_labels = reverseInt(number_of_labels); - - std::vector _dataset(number_of_labels); - for (int i = 0; i < number_of_labels; i++) { - _dataset[i].reset(new Data(1, 0.0f)); - unsigned char label; - file.read((char*)&label, 1); - (*_dataset[i])[0] = label; - } - - return _dataset; - } -}; - -} // namespace mnist -} diff --git a/src/data/trainer.h b/src/data/trainer.h deleted file mode 100644 index 3c9420df..00000000 --- a/src/data/trainer.h +++ /dev/null @@ -1,142 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "common/keywords.h" -#include "common/definitions.h" -#include "graph/expression_graph.h" -#include "optimizers/optimizers.h" -#include "data/batch_generator.h" - -namespace marian { - -class RunBase { - public: - virtual void run() = 0; -}; - -typedef std::shared_ptr RunBasePtr; - -template -class Trainer : public RunBase, - public keywords::Keywords { - private: - ExpressionGraphPtr graph_; - std::shared_ptr dataset_; - - public: - template - Trainer(ExpressionGraphPtr graph, - std::shared_ptr dataset, - Args... args) - : Keywords(args...), - graph_(graph), - dataset_(dataset) - {} - - void run() { - using namespace data; - using namespace keywords; - boost::timer::cpu_timer trainTimer; - - auto opt = Get(optimizer, Optimizer()); - auto batchSize = Get(batch_size, 200); - auto maxEpochs = Get(max_epochs, 50); - BatchGenerator bg(dataset_, batchSize); - - auto validator = Get(valid, RunBasePtr()); - - size_t update = 0; - for(int epoch = 1; epoch <= maxEpochs; ++epoch) { - boost::timer::cpu_timer epochTimer; - bg.prepare(); - - float cost = 0; - float totalExamples = 0; - while(bg) { - auto batch = bg.next(); - opt->update(graph_); - cost += graph_->get("cost")->val()->scalar() * batch->dim(); - totalExamples += batch->dim(); - update++; - } - cost = cost / totalExamples; - - std::cerr << "Epoch: " << std::setw(std::to_string(maxEpochs).size()) - << epoch << "/" << maxEpochs << " - Update: " << update - << " - Cost: " << std::fixed << std::setprecision(4) << cost - << " - Time: " << epochTimer.format(2, "%ws") - << " - " << trainTimer.format(0, "%ws") << std::endl; - - if(validator) - validator->run(); - } - } -}; - -template -class Validator : public RunBase, - public keywords::Keywords { - private: - ExpressionGraphPtr graph_; - std::shared_ptr dataset_; - - float correct(const std::vector pred, const std::vector labels) { - size_t num = labels.size(); - size_t scores = pred.size() / num; - size_t acc = 0; - for (size_t i = 0; i < num; ++i) { - size_t proposed = 0; - for(size_t j = 0; j < scores; ++j) { - if(pred[i * scores + j] > pred[i * scores + proposed]) - proposed = j; - } - acc += (proposed == labels[i]); - } - return (float)acc; - } - - public: - template - Validator(ExpressionGraphPtr graph, - std::shared_ptr dataset, - Args... 
args) - : Keywords(args...), - graph_(graph), - dataset_(dataset) - {} - - void run() { - using namespace data; - using namespace keywords; - - auto batchSize = Get(batch_size, 200); - BatchGenerator bg(dataset_, batchSize); - - size_t update = 0; - bg.prepare(false); - - float total = 0; - float cor = 0; - while(bg) { - auto batch = bg.next(); - graph_->forward(); - std::vector scores; - graph_->get("scores")->val()->get(scores); - - cor += correct(scores, batch->inputs()[1].data()); - total += batch->dim(); - update++; - } - std::cerr << "Accuracy: " << cor / total << std::endl; - } -}; - -template -RunBasePtr Run(Args&& ...args) { - return RunBasePtr(new Process(args...)); -} - -} diff --git a/src/data/types.h b/src/data/types.h index d126395d..4232a985 100644 --- a/src/data/types.h +++ b/src/data/types.h @@ -7,6 +7,8 @@ typedef size_t Word; typedef std::vector Words; -const Word EOS = 0; -const Word UNK = 1; +const Word EOS_ID = 0; +const Word UNK_ID = 1; +const std::string EOS_STR = ""; +const std::string UNK_STR = ""; diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 372806f9..4d62c765 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -1,26 +1,15 @@ #include +#include #include "data/vocab.h" #include "common/utils.h" #include "common/file_stream.h" #include "3rd_party/exception.h" #include "3rd_party/yaml-cpp/yaml.h" +#include "common/logging.h" -Vocab::Vocab(const std::string& path, int max) { - YAML::Node vocab = YAML::Load(InputFileStream(path)); - for(auto&& pair : vocab) { - auto str = pair.first.as(); - auto id = pair.second.as(); - if (id < (Word)max) { - str2id_[str] = id; - if(id >= id2str_.size()) - id2str_.resize(id + 1); - id2str_[id] = str; - } - } - UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << path); - id2str_[0] = ""; +Vocab::Vocab() { } size_t Vocab::operator[](const std::string& word) const { @@ -28,7 +17,7 @@ size_t Vocab::operator[](const std::string& word) const { if(it != str2id_.end()) return it->second; else - return 1; + return UNK_ID; } Words Vocab::operator()(const std::vector& lineTokens, bool addEOS) const { @@ -36,7 +25,7 @@ Words Vocab::operator()(const std::vector& lineTokens, bool addEOS) std::transform(lineTokens.begin(), lineTokens.end(), words.begin(), [&](const std::string& w) { return (*this)[w]; }); if(addEOS) - words.push_back(EOS); + words.push_back(EOS_ID); return words; } @@ -49,7 +38,7 @@ Words Vocab::operator()(const std::string& line, bool addEOS) const { std::vector Vocab::operator()(const Words& sentence, bool ignoreEOS) const { std::vector decoded; for(size_t i = 0; i < sentence.size(); ++i) { - if(sentence[i] != EOS || !ignoreEOS) { + if(sentence[i] != EOS_ID || !ignoreEOS) { decoded.push_back((*this)[sentence[i]]); } } @@ -65,3 +54,91 @@ const std::string& Vocab::operator[](size_t id) const { size_t Vocab::size() const { return id2str_.size(); } + +void Vocab::loadOrCreate(const std::string& trainPath, int max) +{ + if(boost::filesystem::exists(trainPath + ".json")) { + load(trainPath + ".json", max); + return; + } + if(boost::filesystem::exists(trainPath + ".yml")) { + load(trainPath + ".yml", max); + return; + } + + create(trainPath + ".yml", max, trainPath); + load(trainPath + ".yml", max); +} + +void Vocab::load(const std::string& vocabPath, int max) +{ + LOG(data) << "Loading vocabulary from " << vocabPath << " (max: " << max << ")"; + YAML::Node vocab = YAML::Load(InputFileStream(vocabPath)); + for(auto&& pair : vocab) { + auto str = pair.first.as(); + auto id = pair.second.as(); + if (id < 
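        // entries beyond the requested maximum vocabulary size are skipped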
(Word)max) { + str2id_[str] = id; + if(id >= id2str_.size()) + id2str_.resize(id + 1); + id2str_[id] = str; + } + } + UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << vocabPath); + + id2str_[EOS_ID] = EOS_STR; + id2str_[UNK_ID] = UNK_STR; +} + +class Vocab::VocabFreqOrderer +{ +public: + bool operator()(const Vocab::Str2Id::value_type* a, const Vocab::Str2Id::value_type* b) const { + return a->second < b->second; + } +}; + +void Vocab::create(const std::string& vocabPath, int max, const std::string& trainPath) +{ + LOG(data) << "Creating vocabulary " << vocabPath + << " from " << trainPath << " (max: " << max << ")"; + + UTIL_THROW_IF2(boost::filesystem::exists(vocabPath), + "Vocab file " << vocabPath << " exist. Not overwriting"); + + InputFileStream trainStrm(trainPath); + + Str2Id vocab; + std::string line; + while (getline((std::istream&)trainStrm, line)) { + std::vector toks; + Split(line, toks); + + for (const std::string &tok: toks) { + Str2Id::iterator iter = vocab.find(tok); + if (iter == vocab.end()) + vocab[tok] = 1; + else + iter->second++; + } + } + + // put into vector & sort + std::vector vocabVec; + vocabVec.reserve(max); + + for (const Str2Id::value_type &p: vocab) + vocabVec.push_back(&p); + std::sort(vocabVec.rbegin(), vocabVec.rend(), VocabFreqOrderer()); + + YAML::Node vocabYaml; + vocabYaml[EOS_STR] = EOS_ID; + vocabYaml[UNK_STR] = UNK_ID; + for(size_t i = 0; i < vocabVec.size(); ++i) { + const Str2Id::value_type *p = vocabVec[i]; + vocabYaml[p->first] = i + 2; + } + + OutputFileStream vocabStrm(vocabPath); + (std::ostream&)vocabStrm << vocabYaml; +} diff --git a/src/data/vocab.h b/src/data/vocab.h index e7e2c416..e61ad605 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -8,7 +8,7 @@ class Vocab { public: - Vocab(const std::string& path, int max = 50000); + Vocab(); size_t operator[](const std::string& word) const; @@ -22,7 +22,16 @@ class Vocab { size_t size() const; + void loadOrCreate(const std::string& textPath, int max); + void load(const std::string& vocabPath, int max); + void create(const std::string& vocabPath, int max, const std::string& trainPath); + private: - std::map str2id_; - std::vector id2str_; + typedef std::map Str2Id; + Str2Id str2id_; + + typedef std::vector Id2Str; + Id2Str id2str_; + + class VocabFreqOrderer; }; diff --git a/src/graph/chainable.h b/src/graph/chainable.h index ef358325..07bc294e 100644 --- a/src/graph/chainable.h +++ b/src/graph/chainable.h @@ -23,6 +23,7 @@ #include #include +#include #include "exception.h" @@ -106,6 +107,8 @@ struct Chainable { virtual void debug(const std::string& message) = 0; virtual bool marked_for_debug() = 0; virtual const std::string& debug_message() = 0; + + virtual size_t hash() = 0; }; /** diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 15885553..dc30803f 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -1,26 +1,5 @@ #pragma once -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - #include #include #include @@ -32,7 +11,9 @@ #include "data/batch_generator.h" #include "tensors/tensor_allocator.h" #include "layers/param_initializers.h" +#include "kernels/dropout.h" #include "3rd_party/threadpool.h" +#include "3rd_party/cnpy/cnpy.h" namespace marian { @@ -66,9 +47,10 @@ class ExpressionGraph : public std::enable_shared_from_this { Ptr tensors_; cublasHandle_t cublasHandle_; + curandGenerator_t curandGenerator_; size_t device_{0}; - - size_t stale_{0}; + + std::unordered_map hashMap_; protected: /** @brief Constructs a new expression graph @@ -84,17 +66,26 @@ class ExpressionGraph : public std::enable_shared_from_this { public: + ~ExpressionGraph() { + clear(); + } + void setDevice(size_t device = 0) { device_ = device; params_.init(device); tensors_ = New(device); cublasHandle_ = create_handle(device); + curandGenerator_ = createCurandGenerator(device, 1234); } cublasHandle_t getCublasHandle() { return cublasHandle_; } + curandGenerator_t getCurandGenerator() { + return curandGenerator_; + } + size_t getDevice() { return device_; } @@ -132,26 +123,34 @@ class ExpressionGraph : public std::enable_shared_from_this { * @param batchSize XXX Marcin, could you provide a description of this param? 
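 *
 * (With this change the method returns the index one past the last node
 * executed, so a later forward(pos) call can resume the pass incrementally.)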
*/ - void forward() { + size_t forward() { params_.allocateForward(); - for(auto&& tape : tapes_) { - for(auto&& v : tape) { - v->allocate(); - v->init(); - v->forward(); + return forward(0); + } - // @TODO: should be done in node - for(auto&& child : v->children()) { - v->decreaseEdges(1); - child->decreaseEdges(1); - } + size_t forward(size_t pos) { + // @TODO: check if allocation works properly - if(v->marked_for_debug()) { - std::cerr << "Debug: " << v->debug_message() << std::endl; - std::cerr << v->val()->debug() << std::endl; - } + auto it = nodes_.begin() + pos; + while(it != nodes_.end()) { + auto v = *it; + v->allocate(); + v->init(); + v->forward(); + + // @TODO: should be done in node + for(auto&& child : v->children()) { + v->decreaseEdges(1); + child->decreaseEdges(1); } + + if(v->marked_for_debug()) { + std::cerr << "Debug: " << v->debug_message() << std::endl; + std::cerr << v->val()->debug() << std::endl; + } + it++; } + return std::distance(nodes_.begin(), it); } /** @@ -172,7 +171,7 @@ class ExpressionGraph : public std::enable_shared_from_this { params_.allocateBackward(); params_.set_zero_adjoint(); - + for(auto&& v : topNodes_) v->init_dependent(); @@ -202,7 +201,7 @@ class ExpressionGraph : public std::enable_shared_from_this { it++; } } - + /** * @brief Returns a string representing this expression graph in graphviz notation. * @@ -300,8 +299,6 @@ class ExpressionGraph : public std::enable_shared_from_this { * * This method does not attach the new constant node to any existing expression graph. * - * @param args XXX Marcin, what are args here? - * * @return a newly constructed constant node */ template @@ -341,6 +338,17 @@ class ExpressionGraph : public std::enable_shared_from_this { args...); } + template + inline Expr dropout(float prob, Shape shape) { + auto dropoutInit = [prob, this](Tensor t) { + Dropout(t, prob, getCurandGenerator()); + }; + + return Expression(shared_from_this(), + keywords::init=dropoutInit, + keywords::shape=shape); + } + /*********************************************************/ /** @@ -387,10 +395,18 @@ class ExpressionGraph : public std::enable_shared_from_this { named_.emplace(name, e); } - void add(Expr node) { + Expr add(Expr node) { size_t group = 0; + size_t hash = node->hash(); + auto it = hashMap_.find(hash); + if(it != hashMap_.end()) + return it->second; + + hashMap_[hash] = node; + node->setId(count_++); + for(auto& child: node->children()) { group = std::max(group, tapeMap_[child] + 1); child->increaseEdges(2); @@ -402,6 +418,8 @@ class ExpressionGraph : public std::enable_shared_from_this { tapes_[group].push_back(node); nodes_.push_back(node); topNodes_.insert(node); + + return node; } void remove_top_node(Expr node) { @@ -428,18 +446,72 @@ class ExpressionGraph : public std::enable_shared_from_this { inputs_.clear(); topNodes_.clear(); tensors_->clear(); + hashMap_.clear(); } - + Expr topNode() { return nodes_.back(); } + + void load(const std::string& name) { + using namespace keywords; + + LOG(info) << "Loading model from " << name; + + auto numpy = cnpy::npz_load(name); + + for(auto it : numpy) { + auto name = it.first; + + Shape shape; + if(it.second.shape.size() == 2) { + shape.set(0, it.second.shape[0]); + shape.set(1, it.second.shape[1]); + } + else if(it.second.shape.size() == 1) { + shape.set(0, 1); + shape.set(1, it.second.shape[0]); + } + + param(name, shape, + init=inits::from_numpy(it.second)); + } + } + + void save(const std::string& name) { + LOG(info) << "Saving model to " << name; + + unsigned shape[2]; + 
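    // cnpy::npz_save is called with mode "w" for the first parameter,
    // creating the archive, then with "a" to append the remaining ones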
std::string mode = "w"; + + cudaSetDevice(getDevice()); + for(auto p : params().getMap()) { + std::vector v; + p.second->val() >> v; + + unsigned dim; + if(p.second->shape()[0] == 1) { + shape[0] = p.second->shape()[1]; + dim = 1; + } + else { + shape[0] = p.second->shape()[0]; + shape[1] = p.second->shape()[1]; + dim = 2; + } + std::string pName = p.first; + cnpy::npz_save(name, pName, v.data(), shape, dim, mode); + mode = "a"; + } + } }; template Expr Expression(Args&& ... args) { + // @TODO check hash, if exists do not add and return + // cached node to minimize calculations auto e = Expr(new T(std::forward(args)...)); - e->graph()->add(e); - return e; + return e->graph()->add(e); } } diff --git a/src/graph/expression_operators.cu b/src/graph/expression_operators.cu index aa0a8245..40687bd1 100644 --- a/src/graph/expression_operators.cu +++ b/src/graph/expression_operators.cu @@ -118,11 +118,6 @@ Expr tanh(const std::vector& nodes) { return Expression(nodes); } -//Expr tanh(Expr a, Expr b, Expr c) { -// std::vector nodes = {a, b, c}; -// return Expression(nodes); -//} - Expr logit(const std::vector&) { UTIL_THROW2("Not implemented"); } @@ -131,5 +126,30 @@ Expr relu(const std::vector&) { UTIL_THROW2("Not implemented"); } +Expr sqrt(Expr a, float eps) { + return Expression(a, eps); +} + +Expr square(Expr a) { + return Expression(a); +} + +Expr layer_norm(Expr x, Expr gamma, Expr beta) { + std::vector nodes = {x, gamma}; + if(beta) + nodes.push_back(beta); + return Expression(nodes); +} + +//Expr batch_norm(Expr x, Expr gamma, Expr beta) { +// auto mju = mean(x, keywords::axis=0); +// auto xmmju = x - mju; +// auto std = sqrt(mean(square(xmmju), keywords::axis=0), 1e-9); +// +// if(beta) +// return gamma * (xmmju / std) + beta; +// else +// return gamma * (xmmju / std); +//} } diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 095d9219..734ef372 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -143,4 +143,25 @@ Expr weighted_average(Expr in, Expr weights, Args ...args) { Expr step(Expr a, size_t step); +Expr sqrt(Expr a, float eps = 0.f); +Expr square(Expr a); + +Expr layer_norm(Expr x, Expr gamma, Expr beta = nullptr); +//Expr batch_norm(Expr x, Expr gamma, Expr beta = nullptr); + +template +Expr dropout(Expr x, Args ...args) { + auto mask = Get(keywords::mask, nullptr, args...); + float dropout_prob = Get(keywords::dropout_prob, 0.0f, args...); + + UTIL_THROW_IF2(!mask && !dropout_prob, + "Neither mask nor dropout prob given"); + if(!mask) { + auto graph = x->graph(); + mask = graph->dropout(dropout_prob, x->shape()); + } + return x * mask; +} + + } diff --git a/src/graph/node.h b/src/graph/node.h index 674eef53..65de3880 100644 --- a/src/graph/node.h +++ b/src/graph/node.h @@ -186,6 +186,7 @@ class Node : public Chainable, }; struct NaryNodeOp : public Node { + size_t hash_{0}; std::vector children_; template @@ -205,6 +206,17 @@ struct NaryNodeOp : public Node { return children_; } + virtual size_t hash() { + if(!hash_) { + std::size_t seed = boost::hash()(name()); + boost::hash_combine(seed, type()); + for(auto child : children()) + boost::hash_combine(seed, child->hash()); + hash_ = seed; + } + return hash_; + } + void remove_children_from_top_nodes(); }; diff --git a/src/graph/node_operators.h b/src/graph/node_operators.h index af5bf315..98701e3a 100644 --- a/src/graph/node_operators.h +++ b/src/graph/node_operators.h @@ -82,6 +82,11 @@ struct ConstantNode : public Node { return "white"; } + virtual size_t 
hash() { + // @TODO: think of something better for constant nodes + return boost::hash()((size_t)this); + } + private: std::function init_; bool initialized_; @@ -117,6 +122,10 @@ struct ParamNode : public Node { return "orangered"; } + virtual size_t hash() { + return boost::hash()((size_t)this); + } + private: std::function init_; bool initialized_; diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index ba6f47bc..51d4b2f9 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -20,13 +20,8 @@ struct DotNodeOp : public NaryNodeOp { auto shapeA = a->shape(); auto shapeB = b->shape(); - Shape outShape; - if((shapeA[2] > 1 || shapeA[3] > 1) && shapeB[2] == 1 && shapeB[3] == 1) - outShape = {shapeA[0], shapeB[1], shapeA[2], shapeA[3]}; - else { - outShape = shapeA; - outShape.set(1, shapeB[1]); - } + Shape outShape = shapeA; + outShape.set(1, shapeB[1]); UTIL_THROW_IF2(shapeA[1] != shapeB[0], "matrix product requires dimensions to match"); return outShape; @@ -338,6 +333,12 @@ struct ConcatenateNodeOp : public NaryNodeOp { Deconcatenate(deconcatenees, adj_, ax_); } + virtual size_t hash() { + size_t seed = NaryNodeOp::hash(); + boost::hash_combine(seed, ax_); + return seed; + } + const std::string type() { return "concat"; } @@ -437,5 +438,33 @@ struct AffineNodeOp : public NaryNodeOp { } }; +struct LayerNormalizationOp : public NaryNodeOp { + LayerNormalizationOp(const std::vector& nodes) + : NaryNodeOp(nodes) {} + + NodeOps forwardOps() { + return { + NodeOp( + LayerNormalization(val_, + children_[0]->val(), + children_[1]->val(), + (children_.size() == 3) ? children_[2]->val() : nullptr)) + }; + } + + NodeOps backwardOps() { + return { + NodeOp(LayerNormalizationGrad(children_[0]->grad(), children_[1]->grad(), (children_.size() == 3) ? children_[2]->grad() : nullptr, + adj_, val_, children_[0]->val(), children_[1]->val(), + (children_.size() == 3) ? children_[2]->val() : nullptr)) + }; + } + + const std::string type() { + return "layer_normalization"; + } + +}; + } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index f58df3bc..2da3c463 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -232,6 +232,16 @@ struct SoftmaxNodeOp : public NaryNodeOp { }; } + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + if(mask_) + boost::hash_combine(hash_, mask_->hash()); + } + return hash_; + } + + NodeOps backwardOps() { // For each row, the Jacobian times vector is given by: // J * dy = p .* (dy - avg*1) @@ -281,9 +291,12 @@ struct LogSoftmaxNodeOp : public UnaryNodeOp { }; struct SumNodeOp : public UnaryNodeOp { + int ax_; + template SumNodeOp(Expr a, Args ...args) - : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { } + : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...), + ax_(keywords::Get(keywords::axis, -1, args...)) { } NodeOps forwardOps() { return { NodeOp(Reduce(_1, val_, children_[0]->val())) }; @@ -317,19 +330,31 @@ struct SumNodeOp : public UnaryNodeOp { return "orange"; } + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + boost::hash_combine(hash_, ax_); + } + return hash_; + } + + }; struct MeanNodeOp : public UnaryNodeOp { + int ax_; + template MeanNodeOp(Expr a, Args ...args) - : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) 
{ } + : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...), + ax_(keywords::Get(keywords::axis, -1, args...)) { } NodeOps forwardOps() { int left = children_[0]->shape().elements() / val_->shape().elements(); float scale = 1.f / left; return { - NodeOp(Reduce(_1 * scale, val_, children_[0]->val())) + NodeOp(Reduce(_1, val_, children_[0]->val(), scale)) }; } @@ -338,7 +363,7 @@ struct MeanNodeOp : public UnaryNodeOp { float scale = 1.f / left; return { - NodeOp(Add(_1 * scale, children_[0]->grad(), adj_)) + NodeOp(Add(_1, children_[0]->grad(), adj_, scale)) }; } @@ -365,6 +390,15 @@ struct MeanNodeOp : public UnaryNodeOp { const std::string color() { return "orange"; } + + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + boost::hash_combine(hash_, ax_); + } + return hash_; + } + }; @@ -423,6 +457,78 @@ struct ExpNodeOp : public UnaryNodeOp { }; +struct SqrtNodeOp : public UnaryNodeOp { + float epsilon_; + + template + SqrtNodeOp(Expr a, float epsilon, Args ...args) + : UnaryNodeOp(a, args...), + epsilon_(epsilon) { } + + NodeOps forwardOps() { + return { + NodeOp(Element(_1 = Sqrt(_2 + epsilon_), + val_, + children_[0]->val())) + }; + } + + NodeOps backwardOps() { + return { + NodeOp(Add(0.5f * (1.f / _1) * _2, + children_[0]->grad(), + val_, + adj_)) + }; + } + + const std::string type() { + return "sqrt"; + } + + virtual size_t hash() { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + boost::hash_combine(seed, epsilon_); + hash_ = seed; + } + return hash_; + } + + +}; + +struct SquareNodeOp : public UnaryNodeOp { + float epsilon_; + + template + SquareNodeOp(Args ...args) + : UnaryNodeOp(args...) { } + + NodeOps forwardOps() { + return { + NodeOp(Element(_1 = _2 * _2, + val_, + children_[0]->val())) + }; + } + + NodeOps backwardOps() { + return { + NodeOp(Add(2.f * _1 * _2, + children_[0]->grad(), + children_[0]->val(), + adj_)) + }; + } + + const std::string type() { + return "square"; + } + +}; + + struct NegNodeOp : public UnaryNodeOp { template NegNodeOp(Args ...args) @@ -489,6 +595,17 @@ struct RowsNodeOp : public UnaryNodeOp { return "orange"; } + virtual size_t hash() { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + for(auto i : indeces_) + boost::hash_combine(seed, i); + hash_ = seed; + } + return hash_; + } + + std::vector indeces_; }; @@ -567,6 +684,17 @@ struct ReshapeNodeOp : public UnaryNodeOp { const std::string color() { return "grey"; } + + virtual size_t hash() { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + for(auto s : shape()) + boost::hash_combine(seed, s); + hash_ = seed; + } + return hash_; + } + }; struct TimestepNodeOp : public UnaryNodeOp { @@ -619,6 +747,15 @@ struct TimestepNodeOp : public UnaryNodeOp { const std::string color() { return "grey"; } + + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + boost::hash_combine(hash_, step_); + } + return hash_; + } + }; } diff --git a/src/kernels/dropout.cu b/src/kernels/dropout.cu new file mode 100644 index 00000000..358f8871 --- /dev/null +++ b/src/kernels/dropout.cu @@ -0,0 +1,54 @@ +#include +#include + +#include "kernels/dropout.h" + + +#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \ + printf("Error at %s:%d\n",__FILE__,__LINE__);\ + exit(1);}} while(0) + +#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n",__FILE__,__LINE__);\ + exit(1);}} while(0) + +namespace marian { + +curandGenerator_t createCurandGenerator(size_t device, + size_t seed) { + cudaSetDevice(device); + curandGenerator_t generator; + 
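  // one generator per device; the fixed seed passed by the caller
  // (1234 in ExpressionGraph::setDevice) keeps dropout masks reproducible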
CURAND_CALL(curandCreateGenerator(&generator, + CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed)); + + //cudaStream_t stream = 0; + //CURAND_CALL(curandSetStream(generator, stream)); + //CURAND_CALL(curandDestroyGenerator(generator)); + return generator; +} + + +__global__ +void gScale(float* data, int n, float p) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + while (index < n) { + data[index] = (data[index] < p) / p; + index += gridDim.x * blockDim.x; + } +} + +void Dropout(Tensor tensor, float p, + curandGenerator_t gen) { + + int n = tensor->size(); + CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); + + int numThreads = std::min(n, 512); + int numBlocks = n / numThreads + (n % numThreads != 0); + + gScale<<>>(tensor->data(), n, 1.f - p); +} + +} diff --git a/src/kernels/dropout.h b/src/kernels/dropout.h new file mode 100644 index 00000000..90fd996f --- /dev/null +++ b/src/kernels/dropout.h @@ -0,0 +1,15 @@ +#include +#include +#include +#include + +#include "tensors/tensor.h" + +namespace marian { + +curandGenerator_t createCurandGenerator(size_t device, size_t seed=1234); + +void Dropout(Tensor tensor, float h, + curandGenerator_t gen); + +} diff --git a/src/kernels/dropout_cudnn.cu b/src/kernels/dropout_cudnn.cu deleted file mode 100644 index f2190832..00000000 --- a/src/kernels/dropout_cudnn.cu +++ /dev/null @@ -1,70 +0,0 @@ -#include "dropout_cudnn.h" - -#include "tensors/tensor.h" - -namespace marian { - -static cudnnHandle_t create_handle_dnn() { - cudnnHandle_t cudnnHandle; - cudnnCreate(&cudnnHandle); - return cudnnHandle; -} - -cudnnHandle_t cudnnHandle = create_handle_dnn(); - -void CudnnDropoutPrepare(Tensor in, float p, - cudnnDropoutDescriptor_t* dropDesc, - void** space, size_t* spaceSize, - void** states, size_t seed) { - size_t statesSize; - cudnnDropoutGetStatesSize(cudnnHandle, &statesSize); - cudnnDropoutGetReserveSpaceSize(in->cudnn(), spaceSize); - - cudaMalloc((void**)states, statesSize); - cudaMalloc((void**)space, *spaceSize); - - cudnnCreateDropoutDescriptor(dropDesc); - cudnnSetDropoutDescriptor(*dropDesc, - cudnnHandle, - p, - (void*)*states, - statesSize, - seed); -} - -void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc, - void* space, void* states) { - cudnnDestroyDropoutDescriptor(dropDesc); - cudaFree(space); - cudaFree(states); -} - -void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc, - void* space, size_t spaceSize, - Tensor out, Tensor in) { - cudnnDropoutForward(cudnnHandle, - dropoutDesc, - in->cudnn(), - in->data(), - out->cudnn(), - out->data(), - space, - spaceSize); -} - -/* void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc, */ - /* void* space, size_t spaceSize, */ - /* Tensor out, Tensor in) { */ - /* auto inGpu = static_cast(in.get()); */ - /* auto outGpu = static_cast(out.get()); */ - /* cudnnDropoutBackward(cudnnHandle, */ - /* dropoutDesc, */ - /* inGpu->cudnn(), */ - /* inGpu->data(), */ - /* outGpu->cudnn(), */ - /* outGpu->data(), */ - /* space, */ - /* spaceSize); */ -/* } */ - -} diff --git a/src/kernels/dropout_cudnn.h b/src/kernels/dropout_cudnn.h deleted file mode 100644 index 6f51c68b..00000000 --- a/src/kernels/dropout_cudnn.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once -#include - -#include "tensors/tensor.h" - -namespace marian { - -void CudnnDropoutPrepare(Tensor in, float p, - cudnnDropoutDescriptor_t* dropDesc, - void** space, size_t* spaceSize, - void** states, size_t seed); - -void 
CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc, - void* space, void* states); - -void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc, - void* space, size_t spaceSize, - Tensor out, Tensor in); - -void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc, - void* space, size_t spaceSize, - Tensor out, Tensor in); - -} diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu index da4e0efa..254c04d7 100644 --- a/src/kernels/tensor_operators.cu +++ b/src/kernels/tensor_operators.cu @@ -25,7 +25,7 @@ #include "3rd_party/reduce_all.h" namespace marian { - + cublasHandle_t create_handle(size_t device) { cudaSetDevice(device); @@ -116,7 +116,7 @@ __global__ void gSoftmax(float* out, const Shape outShape, const float* in, const float* mask) { - int rows = outShape[0]; + int rows = outShape[0] * outShape[2] * outShape[3]; int cols = outShape[1]; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; @@ -129,7 +129,7 @@ __global__ void gSoftmax(float* out, float* _max = _share + blockDim.x; _max[threadIdx.x] = sp[threadIdx.x]; // mask - for(int tid = 1; tid < cols; tid += blockDim.x) { + for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if (id < cols) { if (sp[id] > _max[threadIdx.x]) @@ -210,7 +210,7 @@ void Softmax(Tensor out, Tensor in, Tensor mask) { __global__ void gLogSoftmax(float* out, const Shape outShape, const float* in) { - int rows = outShape[0]; + int rows = outShape[0] * outShape[2] * outShape[3]; int cols = outShape[1]; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; @@ -222,7 +222,7 @@ __global__ void gLogSoftmax(float* out, float* _max = _share + blockDim.x; _max[threadIdx.x] = sp[threadIdx.x]; - for(int tid = 1; tid < cols; tid += blockDim.x) { + for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if (id < cols) { if (sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; @@ -277,8 +277,8 @@ __global__ void gLogSoftmax(float* out, void LogSoftmax(Tensor out, Tensor in) { cudaSetDevice(out->getDevice()); - - size_t m = out->shape()[0]; + + size_t m = out->shape()[0] * out->shape()[2] * out->shape()[3]; size_t k = out->shape()[1]; int blocks = std::min(MAX_BLOCKS, (int) m); @@ -392,12 +392,12 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val, void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { cudaSetDevice(adj->getDevice()); - + // grad and val are both m-by-k matrices, passed as input. // A weighted average of each row of grad (according to the weights // specified in val) is computed and subtracted from Out. 
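  // (rows m now fold in dimensions 2 and 3, so batched 3- and 4-d tensors
  // go through the same row-wise kernel)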
// adj is multiplied for each element to get backward step in autodiff - int m = grad->shape()[0]; + int m = grad->shape()[0] * grad->shape()[2] * grad->shape()[3]; int k = grad->shape()[1]; int blocks = std::min(MAX_BLOCKS, m); @@ -548,7 +548,7 @@ __global__ void gCopyRows(float* out, const float* in, size_t cols, void CopyRows(Tensor out, const Tensor in, const std::vector& indeces) { cudaSetDevice(out->getDevice()); - + size_t cols = in->shape()[1]; size_t rowsToCopy = indeces.size(); @@ -589,7 +589,7 @@ __global__ void gPasteRows(float* out, const float* in, size_t cols, void PasteRows(Tensor out, const Tensor in, const std::vector& indeces) { cudaSetDevice(out->getDevice()); - + size_t cols = in->shape()[1]; size_t rowsToCopy = indeces.size(); @@ -610,19 +610,23 @@ void PasteRows(Tensor out, const Tensor in, const std::vector& indeces) void Transpose(cublasHandle_t cublasHandle, Tensor out, const Tensor in) { cudaSetDevice(out->getDevice()); - - size_t m = in->shape()[0]; - size_t n = in->shape()[1]; - float alpha = 1.0; - float beta = 0.0; + size_t steps = in->shape()[2] * in->shape()[3]; + for(int i = 0; i < steps; i++) { + size_t m = in->shape()[0]; + size_t n = in->shape()[1]; + float alpha = 1.0; + float beta = 0.0; - cublasSgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, in->data(), n, - &beta, in->data(), n, out->data(), m); + size_t offset = i * steps; + + cublasSgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, in->data() + offset, n, + &beta, in->data() + offset, n, out->data() + offset, m); + } } void Concatenate0(Tensor out, const std::vector& inputs) { cudaSetDevice(out->getDevice()); - + size_t offset = 0; for(auto in : inputs) { UTIL_THROW_IF2(out->shape()[1] != in->shape()[1], @@ -658,9 +662,9 @@ __global__ void gInsertCols(float* out, const float* in, // dimensions, verify this! 
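A note on the recurring shape[0] * shape[2] * shape[3] changes in this file: marian tensors are 4-D with a {batch, dim, time, beam} layout, and the row-wise kernels here (softmax, log-softmax and their gradients, the GRU steps, concatenation) treat axis 1 as the column/feature axis. Flattening batch, time and beam into the row count lets the same 2-D kernels run once per (batch, time, beam) slice instead of assuming plain matrices. Roughly (a sketch of the convention, not code from the patch):

    // Assumes marian's {batch, dim, time, beam} axis order.
    int rows = shape[0] * shape[2] * shape[3]; // batch * time * beam row reductions
    int cols = shape[1];                       // reduction width = feature dimension
    // e.g. a {64, 512, 40, 5} tensor yields 64 * 40 * 5 = 12800 rows of width 512.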
void Concatenate1(Tensor out, const std::vector& inputs) { cudaSetDevice(out->getDevice()); - + size_t offset = 0; - int rows = out->shape()[0]; + int rows = out->shape()[0] * out->shape()[2] * out->shape()[3]; int cols_out = out->shape()[1]; for(auto in : inputs) { @@ -690,7 +694,7 @@ void Concatenate(Tensor out, const std::vector& inputs, int ax) { void Deconcatenate0(std::vector& outputs, const Tensor in) { cudaSetDevice(in->getDevice()); - + size_t offset = 0; for(auto out : outputs) { cudaMemcpy(out->data(), @@ -703,9 +707,9 @@ void Deconcatenate0(std::vector& outputs, const Tensor in) { void Deconcatenate1(std::vector& outputs, const Tensor in) { cudaSetDevice(in->getDevice()); - + size_t offset = 0; - int rows = in->shape()[0]; + int rows = in->shape()[0] * in->shape()[2] * in->shape()[3]; int cols_in = in->shape()[1]; for(auto out : outputs) { UTIL_THROW_IF2(out->shape()[0] != in->shape()[0], @@ -778,8 +782,8 @@ __global__ void gGRUFastForward(float* out, void GRUFastForward(Tensor out, std::vector inputs, bool final){ cudaSetDevice(out->getDevice()); - - int rows = out->shape()[0]; + + int rows = out->shape()[0] * out->shape()[2] * out->shape()[3]; int cols = out->shape()[1]; int blocks = std::min(MAX_BLOCKS, rows); @@ -881,10 +885,10 @@ __global__ void gGRUFastBackward(float* outState, void GRUFastBackward(std::vector outputs, std::vector inputs, Tensor adj, bool final) { - + cudaSetDevice(adj->getDevice()); - - int rows = adj->shape()[0]; + + int rows = adj->shape()[0] * adj->shape()[2] * adj->shape()[3]; int cols = adj->shape()[1]; int blocks = std::min(MAX_BLOCKS, rows); @@ -975,7 +979,7 @@ __global__ void gCrossEntropyPick(float* out, void CrossEntropyPick(Tensor out, Tensor in, Tensor pick) { cudaSetDevice(out->getDevice()); - + size_t m = in->shape()[0]; size_t k = in->shape()[1]; @@ -1065,7 +1069,7 @@ __global__ void gCrossEntropyPickBackward(float* out, void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { cudaSetDevice(out->getDevice()); - + size_t m = out->shape()[0]; size_t k = out->shape()[1]; @@ -1082,7 +1086,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { float L2Norm(Tensor in) { cudaSetDevice(in->getDevice()); - + float* data; cudaMalloc(&data, sizeof(float)); Tensor out(new TensorBase(data, {1, 1}, in->getDevice())); @@ -1094,21 +1098,24 @@ float L2Norm(Tensor in) { } __global__ void gAtt(float* out, - const float* in1, - const float* in2, - const float* in3, - int m, // rows - int k, // cols - int n // rows of in2 - ) { + const float* va, + const float* ctx, + const float* state, + const float* cov, + int m, // total rows (batch x time x beam) + int k, // depth + int b, // batch size + int t // time of ctx + ) { int rows = m; int cols = k; for(int bid = 0; bid < m; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - const float* in1Row = in1 + j * cols; - const float* in2Row = in2 + (j % n) * cols; - const float* in3Row = in3; + const float* vaRow = va; + const float* ctxRow = ctx + (j % (b * t)) * cols; + const float* stateRow = state + (j / (b * t) + j % b) * cols; + const float* covRow = cov ? 
cov + (j % (b * t)) * cols : nullptr;
 
       extern __shared__ float _share[];
       float* _sum = _share + blockDim.x;
@@ -1117,7 +1124,10 @@ __global__ void gAtt(float* out,
       for(int tid = 0; tid < cols; tid += blockDim.x) {
         int id = tid + threadIdx.x;
         if(id < cols) {
-          float ex = tanhf(in1Row[id] + in2Row[id]) * in3Row[id];
+          float z = ctxRow[id] + stateRow[id];
+          if(cov)
+            z += covRow[id];
+          float ex = tanhf(z) * vaRow[id];
           _sum[threadIdx.x] += ex;
         }
       }
@@ -1136,33 +1146,39 @@ __global__ void gAtt(float* out,
   }
 }
 
-void Att(Tensor out, Tensor context, Tensor state, Tensor va) {
+void Att(Tensor out,
+         Tensor va,
+         Tensor context,
+         Tensor state,
+         Tensor coverage) {
   cudaSetDevice(out->getDevice());
-
-  size_t m = context->shape()[0] * context->shape()[2] * context->shape()[3];
-  size_t k = context->shape()[1];
-  size_t n = context->shape()[0];
+  size_t m = out->shape()[0] * out->shape()[2] * out->shape()[3];
+
+  size_t b = context->shape()[0];
+  size_t k = context->shape()[1];
+  size_t t = context->shape()[2];
 
   int blocks = std::min(MAX_BLOCKS, (int) m);
   int threads = std::min(MAX_THREADS, (int) k);
   int shared = sizeof(float) * threads * 2;
-
-
   gAtt<<<blocks, threads, shared>>>(out->data(),
+                                    va->data(),
                                     context->data(),
                                     state->data(),
-                                    va->data(),
-                                    m, k, n);
+                                    coverage ? coverage->data() : nullptr,
+                                    m, k, b, t);
 }
 
-__global__ void gAttBack(float* gContext,
+__global__ void gAttBack(float* gVa,
+                         float* gContext,
                          float* gState,
-                         float* gVa,
+                         float* gCoverage,
+                         const float* va,
                          const float* context,
                          const float* state,
-                         const float* va,
+                         const float* coverage,
                          const float* adj,
                          int m, // rows
                          int k, // cols
@@ -1175,18 +1191,26 @@ __global__ void gAttBack(float* gContext,
     if(j < rows) {
       float* gcRow = gContext + j * cols;
       float* gsRow = gState + (j % n) * cols;
+      float* gcovRow = gCoverage ? gCoverage + j * cols : nullptr;
 
       const float* cRow = context + j * cols;
       const float* sRow = state + (j % n) * cols;
+      const float* covRow = coverage ? coverage + j * cols : nullptr;
 
       for(int tid = 0; tid < cols; tid += blockDim.x) {
         int id = tid + threadIdx.x;
         if(id < cols) {
-          float t = tanhf(cRow[id] + sRow[id]);
+          float z = cRow[id] + sRow[id];
+          if(coverage)
+            z += covRow[id];
+
+          float t = tanhf(z);
           float r = va[id] * (1.f - t * t);
 
           gcRow[id] += r * adj[j];
           gsRow[id] += r * adj[j];
+          if(gCoverage)
+            gcovRow[id] += r * adj[j];
 
           atomicAdd(gVa + id, t * adj[j]);
         }
       }
@@ -1195,11 +1219,11 @@ __global__ void gAttBack(float* gContext,
   }
 }
 
-void AttBack(Tensor gContext, Tensor gState, Tensor gVa,
-             Tensor context, Tensor state, Tensor va,
+void AttBack(Tensor gVa, Tensor gContext, Tensor gState, Tensor gCoverage,
+             Tensor va, Tensor context, Tensor state, Tensor coverage,
              Tensor adj) {
   cudaSetDevice(adj->getDevice());
-
+
   size_t m = context->shape()[0] * context->shape()[2] * context->shape()[3];
   size_t k = context->shape()[1];
@@ -1208,16 +1232,208 @@ void AttBack(Tensor gContext, Tensor gState, Tensor gVa,
   int blocks = std::min(MAX_BLOCKS, (int) n);
   int threads = std::min(MAX_THREADS, (int) k);
 
-  gAttBack<<<blocks, threads>>>(gContext->data(),
+  gAttBack<<<blocks, threads>>>(gVa->data(),
+                                gContext->data(),
                                 gState->data(),
-                                gVa->data(),
+                                gCoverage ? gCoverage->data() : nullptr,
+                                va->data(),
                                 context->data(),
                                 state->data(),
-                                va->data(),
+                                coverage ?
coverage->data() : nullptr, adj->data(), m, k, n); } +__global__ void gLNormalization(float* out, const float* in, const float* alpha, const float* beta, + int rows, int cols, float eps=1e-9) { + extern __shared__ float _share[]; + + for (int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if (j < rows) { + float* so = out + j * cols; + const float* sp = in + j * cols; + + float* _sum = _share + blockDim.x; + _sum[threadIdx.x] = 0.0f; + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + _sum[threadIdx.x] += sp[id]; + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + } + len = (len + 1) >> 1; + } + __syncthreads(); + float mean = _sum[0] / cols; + __syncthreads(); + + float* _sqSum = _share + blockDim.x; + + _sqSum[threadIdx.x] = 0.0; + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + float ex = sp[id] - mean; + _sqSum[threadIdx.x] += ex * ex; + } + } + __syncthreads(); + len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) + _sqSum[threadIdx.x] += _sqSum[threadIdx.x + skip]; + len = (len + 1) >> 1; + } + __syncthreads(); + float sigma = sqrtf(eps + (_sqSum[0] / cols)); + __syncthreads(); + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + float t = alpha[id] * ((sp[id] - mean) / sigma); + if (beta != nullptr) + t += beta[id]; + so[id] = t; + } + } + } + } } + +void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps) { + cudaSetDevice(out->getDevice()); + + int rows = in->shape()[0] * in->shape()[2] * in->shape()[3]; + int cols = in->shape()[1]; + + int blocks = std::min(MAX_BLOCKS, (int)rows); + int threads = std::min(MAX_THREADS, (int)cols); + int shared = 2 * threads * sizeof(float); + + gLNormalization<<>>(out->data(), + in->data(), + gamma->data(), + beta ? beta->data() : nullptr, + rows, cols, eps); +} + +__global__ void gLayerNormalizationGrad(float* gradX, float* gradGamma, float* gradBeta, + float* adj, float* y, float* x, float* gamma, float* beta, + int rows, int cols, float eps=1e-9) { + extern __shared__ float shared[]; + + for (int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if (j < rows) { + float* sum_adj = shared; + float* sum_adj_x = shared + blockDim.x; + float* sum_x = shared + 2 * blockDim.x; + float* sum_sqr = shared + 3 * blockDim.x; + + const float* xRow = x + j * cols; + const float* yRow = y + j * cols; + const float* adjRow = adj + j * cols; + float* gradXRow = gradX + j * cols; + + sum_x[threadIdx.x] = 0.0f; + sum_adj[threadIdx.x] = 0.0f; + sum_adj_x[threadIdx.x] = 0.0f; + sum_sqr[threadIdx.x] = 0.0f; + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + sum_x[threadIdx.x] += xRow[id]; + sum_adj_x[threadIdx.x] += adjRow[id] * (yRow[id] - ((beta) ? 
beta[id] : 0)) / gamma[id]; + sum_adj[threadIdx.x] += adjRow[id]; + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + sum_x[threadIdx.x] += sum_x[threadIdx.x + skip]; + sum_adj[threadIdx.x] += sum_adj[threadIdx.x + skip]; + sum_adj_x[threadIdx.x] += sum_adj_x[threadIdx.x + skip]; + } + len = (len + 1) >> 1; + } + __syncthreads(); + float mean = sum_x[0] / cols; + __syncthreads(); + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + float ex = xRow[id] - mean; + sum_sqr[threadIdx.x] += ex * ex; + } + } + + __syncthreads(); + len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) + sum_sqr[threadIdx.x] += sum_sqr[threadIdx.x + skip]; + len = (len + 1) >> 1; + } + __syncthreads(); + float sigma = sqrtf(eps + (sum_sqr[0] / cols)); + __syncthreads(); + + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + float grad_x = 0.0f; + float x_hat = (yRow[id] - ((beta) ? beta[id] : 0) ) / gamma[id]; + grad_x += cols * adjRow[id]; + grad_x -= sum_adj[0]; + grad_x -= sum_adj_x[0] * x_hat; + grad_x /= (cols * sigma); + + gradXRow[id] += gamma[id] * grad_x; + atomicAdd(gradGamma + id, adjRow[id] * x_hat); + if (beta) { + atomicAdd(gradBeta + id, adjRow[id]); + } + } + } + } + } +} + +void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta, + Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta) { + cudaSetDevice(adj->getDevice()); + int rows = y->shape()[0] * y->shape()[2] * y->shape()[3]; + int cols = y->shape()[1]; + + int threads = std::min(MAX_THREADS, cols); + int blocks = std::min(MAX_BLOCKS, rows); + int shared = sizeof(float) * threads * 4; + + gLayerNormalizationGrad<<>> + (gradX->data(), gradGamma->data(), (gradBeta) ? gradBeta->data() : nullptr, + adj->data(), y->data(), x->data(), gamma->data(),(beta) ? 
beta->data() : nullptr, rows, cols); +} + +} // namespace marian diff --git a/src/kernels/tensor_operators.h b/src/kernels/tensor_operators.h index d1adfde9..99c0586d 100644 --- a/src/kernels/tensor_operators.h +++ b/src/kernels/tensor_operators.h @@ -45,8 +45,9 @@ __global__ void gAdd(Functor functor, Shape outShape, const float* in1, const Shape in1Shape, - const Shape full) { - + const Shape full, + float scale = 1.0) { + int outLength = outShape.elements(); bool same = outLength == full.elements() && outLength == in1Shape.elements(); @@ -61,7 +62,7 @@ __global__ void gAdd(Functor functor, int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < outLength) { if(same) { - out[index] += functor(in1[index]); + out[index] += functor(in1[index]) * scale; } else { outShape.dims(index, dims); @@ -83,15 +84,75 @@ __global__ void gAdd(Functor functor, } } if(sum) - out[index] += sum; + out[index] += sum * scale; } } } } +template +__global__ void gAdd1(Functor functor, + float* out, + Shape outShape, + const float* in1, + const Shape in1Shape, + const Shape full, + float scale = 1.0) { + + int rows = full[0] * full[2] * full[3]; + int cols = full[1]; + bool same = in1Shape.elements() == full.elements(); + + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + extern __shared__ float _share[]; + float* _sum = _share + blockDim.x; + + if(same) { + const float* sp = in1 + j * cols; + _sum[threadIdx.x] = 0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + _sum[threadIdx.x] += functor(sp[id]); + } + } + } + else { + int dims[4]; + _sum[threadIdx.x] = 0; + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + full.dims(j * cols + id, dims); + int in1Index = in1Shape.bindex(dims); + _sum[threadIdx.x] += functor(in1[in1Index]); + } + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + + } + len = (len + 1) >> 1; + } + __syncthreads(); + out[j] += _sum[0] * scale; + } + } +} + + template void Add(Functor functor, - Tensor out, Tensor in) { + Tensor out, Tensor in, float scale = 1.0) { cudaSetDevice(out->getDevice()); @@ -101,20 +162,36 @@ void Add(Functor functor, int length = out->shape().elements(); - int threads = std::min(MAX_THREADS, length); - int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + if(full.elements() / length == full[1]) { + size_t m = full.elements() / length; + size_t k = full[1]; - gAdd<<>>(functor, - out->data(), out->shape(), - in->data(), in->shape(), - full); + int blocks = std::min(MAX_BLOCKS, (int) m); + int threads = std::min(MAX_THREADS, (int) k); + int shared = sizeof(float) * threads * 2; + + gAdd1<<>>(functor, + out->data(), out->shape(), + in->data(), in->shape(), + full, scale); + } + else { + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAdd<<>>(functor, + out->data(), out->shape(), + in->data(), in->shape(), + full, scale); + } } template void Reduce(Functor functor, - T1 out, T2 in) { + T1 out, T2 in, float scale = 1.0) { out->set(0); - Add(functor, out, in); + Add(functor, out, in, scale); } template @@ -125,7 +202,8 @@ __global__ void gAdd(Functor functor, const Shape in1Shape, const float* in2, const Shape in2Shape, - const Shape full) { + const 
Shape full, + float scale = 1.0) { int outLength = outShape.elements(); @@ -144,7 +222,7 @@ __global__ void gAdd(Functor functor, int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if (index < outLength) { if(same) { - out[index] += functor(in1[index], in2[index]); + out[index] += functor(in1[index], in2[index]) * scale; } else { outShape.dims(index, dims); @@ -166,15 +244,80 @@ __global__ void gAdd(Functor functor, } } if(sum) - out[index] += sum; + out[index] += sum * scale; } } } } +template +__global__ void gAdd1(Functor functor, + float* out, + Shape outShape, + const float* in1, + const Shape in1Shape, + const float* in2, + const Shape in2Shape, + const Shape full, + float scale = 1.0) { + + int rows = full[0] * full[2] * full[3]; + int cols = full[1]; + bool same = in1Shape.elements() == full.elements() + && in2Shape.elements() == full.elements(); + + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + extern __shared__ float _share[]; + float* _sum = _share + blockDim.x; + + if(same) { + const float* sp1 = in1 + j * cols; + const float* sp2 = in2 + j * cols; + _sum[threadIdx.x] = 0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + _sum[threadIdx.x] += functor(sp1[id], sp2[id]); + } + } + } + else { + int dims[4]; + _sum[threadIdx.x] = 0; + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + full.dims(j * cols + id, dims); + int in1Index = in1Shape.bindex(dims); + int in2Index = in2Shape.bindex(dims); + _sum[threadIdx.x] += functor(in1[in1Index], in2[in2Index]); + } + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + + } + len = (len + 1) >> 1; + } + __syncthreads(); + out[j] += _sum[0] * scale; + } + } +} + + template void Add(Functor functor, - Tensor out, Tensor in1, Tensor in2) { + Tensor out, Tensor in1, Tensor in2, float scale = 1.0) { cudaSetDevice(out->getDevice()); @@ -186,23 +329,39 @@ void Add(Functor functor, int length = out->shape().elements(); - int threads = std::min(MAX_THREADS, length); - int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + /* + if(full.elements() / length == full[1]) { + size_t m = full.elements() / length; + size_t k = full[1]; - gAdd<<>>(functor, - out->data(), out->shape(), - in1->data(), in1->shape(), - in2->data(), in2->shape(), - full); + int blocks = std::min(MAX_BLOCKS, (int) m); + int threads = std::min(MAX_THREADS, (int) k); + int shared = sizeof(float) * threads * 2; + gAdd1<<>>(functor, + out->data(), out->shape(), + in1->data(), in1->shape(), + in2->data(), in2->shape(), + full); + } + else {*/ + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAdd<<>>(functor, + out->data(), out->shape(), + in1->data(), in1->shape(), + in2->data(), in2->shape(), + full, scale); + //} } template void Reduce(Functor functor, - Tensor out, Tensor in1, Tensor in2) { + Tensor out, Tensor in1, Tensor in2, float scale = 1.0) { out->set(0); - Add(functor, out, in1, in2); + Add(functor, out, in1, in2, scale); } @@ -680,9 +839,13 @@ void GRUFastBackward(std::vector outputs, std::vector inputs, Tensor adj, bool final = false); -void Att(Tensor out, Tensor context, Tensor state, Tensor va); -void AttBack(Tensor gContext, Tensor gState, Tensor gva, - 
Tensor context, Tensor state, Tensor va,
+void Att(Tensor out, Tensor va, Tensor context, Tensor state, Tensor coverage);
+void AttBack(Tensor gva, Tensor gContext, Tensor gState, Tensor gCoverage,
+             Tensor va, Tensor context, Tensor state, Tensor coverage,
              Tensor adj);
+void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps=1e-9);
+void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta,
+                            Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta);
+
 }
diff --git a/src/kernels/thrust_functions.h b/src/kernels/thrust_functions.h
index a48a7f71..e0529364 100644
--- a/src/kernels/thrust_functions.h
+++ b/src/kernels/thrust_functions.h
@@ -48,6 +48,7 @@ namespace thrust
       return compose(unary_operator(), _1);
     }
 
+
     template <typename T>
     struct unary_log : public thrust::unary_function<T, T> {
       __host__ __device__
@@ -166,6 +167,33 @@ namespace thrust
                      make_actor(_1),
                      make_actor(_2));
     }
+
+    template <typename T>
+    struct binary_pow : public thrust::binary_function<T, T, T> {
+      __host__ __device__
+      T operator()(const T &x, const T &y) const {
+        float tx = x;
+        if(y == (int)y && (int)y % 2 == 0)
+          tx = abs(x);
+        return powf(tx, y);
+      }
+    };
+
+    template <typename Eval, typename T2>
+    __host__ __device__
+    actor<
+      composite<
+        binary_operator<binary_pow>,
+        actor<Eval>,
+        typename as_actor<T2>::type
+      >
+    >
+    Pow(const actor<Eval> &_1, const T2 &_2)
+    {
+      return compose(binary_operator<binary_pow>(),
+                     make_actor(_1),
+                     make_actor(_2));
+    }
   }
 }
}
diff --git a/src/layers/attention.h b/src/layers/attention.h
index 6e52c5fd..1f55a5ac 100644
--- a/src/layers/attention.h
+++ b/src/layers/attention.h
@@ -2,6 +2,7 @@
 
 #include "marian.h"
 #include "graph/expression_graph.h"
+#include "layers/rnn.h"
 
 namespace marian {
 
@@ -13,17 +14,19 @@ struct AttentionNodeOp : public NaryNodeOp {
                 keywords::shape=newShape(nodes)) {}
 
   Shape newShape(const std::vector<Expr>& nodes) {
-    Shape shape = nodes[0]->shape();
-    Shape shape2 = nodes[1]->shape();
-    Shape shape3 = nodes[2]->shape();
+    Shape shape = nodes[1]->shape();
 
-    for(int i = 0; i < shape2.size(); ++i) {
-      UTIL_THROW_IF2(shape[i] != shape2[i] && shape[i] != 1 && shape2[i] != 1,
+    Shape vaShape = nodes[0]->shape();
+    Shape ctxShape = nodes[1]->shape();
+    Shape stateShape = nodes[2]->shape();
+
+    for(int i = 0; i < stateShape.size(); ++i) {
+      UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1 && stateShape[i] != 1,
                      "Shapes cannot be broadcasted");
-      shape.set(i, std::max(shape[i], shape2[i]));
+      shape.set(i, std::max(ctxShape[i], stateShape[i]));
     }
 
-    UTIL_THROW_IF2(shape3[0] != shape[1] || shape3[1] != 1,
+    UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1,
                    "Wrong size");
 
     shape.set(1, 1);
@@ -35,7 +38,8 @@ struct AttentionNodeOp : public NaryNodeOp {
       NodeOp(Att(val_,
                  children_[0]->val(),
                  children_[1]->val(),
-                 children_[2]->val()))
+                 children_[2]->val(),
+                 children_.size() == 4 ? children_[3]->val() : nullptr))
     };
   }
 
@@ -46,9 +50,11 @@ struct AttentionNodeOp : public NaryNodeOp {
                  children_[0]->grad(),
                  children_[1]->grad(),
                  children_[2]->grad(),
+                 children_.size() == 4 ? children_[3]->grad() : nullptr,
                  children_[0]->val(),
                  children_[1]->val(),
                  children_[2]->val(),
+                 children_.size() == 4 ?
children_[3]->val() : nullptr, adj_ ); ) @@ -70,22 +76,33 @@ struct AttentionNodeOp : public NaryNodeOp { } }; -Expr attOps(Expr context, Expr state, Expr va) { - std::vector nodes{context, state, va}; +Expr attOps(Expr va, Expr context, Expr state, Expr coverage=nullptr) { + std::vector nodes{va, context, state}; + if(coverage) + nodes.push_back(coverage); + int dimBatch = context->shape()[0]; int dimWords = context->shape()[2]; + int dimBeam = state->shape()[3]; return reshape(Expression(nodes), - {dimWords, dimBatch}); + {dimWords, dimBatch, 1, dimBeam}); } class GlobalAttention { private: Expr Wa_, ba_, Ua_, va_; + Expr gammaContext_, betaContext_; + Expr gammaState_, betaState_; + Expr context_; Expr softmaxMask_; Expr mappedContext_; std::vector contexts_; + std::vector alignments_; + bool layerNorm_; + + Expr cov_; public: @@ -95,21 +112,34 @@ class GlobalAttention { int dimDecState, Args ...args) : context_(context), - softmaxMask_(nullptr) { + softmaxMask_(nullptr), + layerNorm_(Get(keywords::normalize, false, args...)), + cov_(Get(keywords::coverage, nullptr, args...)) { int dimEncState = context->shape()[1]; + auto graph = context->graph(); Wa_ = graph->param(prefix + "_W_comb_att", {dimDecState, dimEncState}, keywords::init=inits::glorot_uniform); - ba_ = graph->param(prefix + "_b_att", {1, dimEncState}, - keywords::init=inits::zeros); Ua_ = graph->param(prefix + "_Wc_att", {dimEncState, dimEncState}, keywords::init=inits::glorot_uniform); va_ = graph->param(prefix + "_U_att", {dimEncState, 1}, keywords::init=inits::glorot_uniform); + ba_ = graph->param(prefix + "_b_att", {1, dimEncState}, + keywords::init=inits::zeros); - mappedContext_ = affine(context_, Ua_, ba_); + if(layerNorm_) { + gammaContext_ = graph->param(prefix + "_att_gamma1", {1, dimEncState}, + keywords::init=inits::from_value(1.0)); + gammaState_ = graph->param(prefix + "_att_gamma2", {1, dimEncState}, + keywords::init=inits::from_value(1.0)); + + mappedContext_ = layer_norm(dot(context_, Ua_), gammaContext_, ba_); + } + else { + mappedContext_ = affine(context_, Ua_, ba_); + } auto softmaxMask = Get(keywords::mask, nullptr, args...); if(softmaxMask) { @@ -124,21 +154,23 @@ class GlobalAttention { int dimBatch = context_->shape()[0]; int srcWords = context_->shape()[2]; + int dimBeam = state->shape()[3]; auto mappedState = dot(state, Wa_); - auto attReduce = attOps(mappedContext_, mappedState, va_); + if(layerNorm_) + mappedState = layer_norm(mappedState, gammaState_); + + auto attReduce = attOps(va_, mappedContext_, mappedState); // @TODO: horrible -> - auto e = reshape( - transpose(softmax(transpose(attReduce), - softmaxMask_)), - {dimBatch, 1, srcWords}); + auto e = reshape(transpose(softmax(transpose(attReduce), softmaxMask_)), + {dimBatch, 1, srcWords, dimBeam}); // <- horrible - auto alignedSource = weighted_average(context_, e, - axis=2); + auto alignedSource = weighted_average(context_, e, axis=2); contexts_.push_back(alignedSource); + alignments_.push_back(e); return alignedSource; } diff --git a/src/layers/dropout.h b/src/layers/dropout.h deleted file mode 100644 index aedfa287..00000000 --- a/src/layers/dropout.h +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include - -#include "tensors/tensor.h" - - -#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \ - printf("Error at %s:%d\n",__FILE__,__LINE__);\ - exit(1);}} while(0) - -#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \ - printf("Error at %s:%d\n",__FILE__,__LINE__);\ - exit(1);}} while(0) - - -__global__ -void 
gScalled(float* data, int n, float p) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - - while (index < n) { - data[index] = (data[index] < p) / p; - index += gridDim.x * blockDim.x; - } -} - -namespace marian { - -class DropoutGenerator { - public: - DropoutGenerator(cudaStream_t stream=0, unsigned long long seed = 1234ULL) { - CURAND_CALL(curandCreateGenerator(&generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator_, seed)); - CURAND_CALL(curandSetStream(generator_, stream)); - } - - void Generate(Tensor& tensor, float p) { - Generate(tensor->data(), tensor->size(), p); - } - - - void Generate(float* data, int n, float p) { - CURAND_CALL(curandGenerateUniform(generator_, data, n)); - int numThreads = std::min(n, 512); - int numBlocks = n / numThreads + (n % numThreads != 0); - - gScalled<<>>(data, n, p); - } - - ~DropoutGenerator() { - CURAND_CALL(curandDestroyGenerator(generator_)); - } - - private: - curandGenerator_t generator_; - -}; - -} diff --git a/src/layers/generic.h b/src/layers/generic.h index 65eea110..1d0256d4 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -28,6 +28,7 @@ namespace marian { private: int outDim_; act activation_; + bool layerNorm_; public: template @@ -38,18 +39,30 @@ namespace marian { outDim_(outDim), activation_(Get(keywords::activation, act::linear, - args...)) {} + args...)), + layerNorm_(Get(keywords::normalize, + false, args...)) {} Expr operator()(Expr in) { auto g = in->graph(); auto W = g->param(name_ + "_W", {in->shape()[1], outDim_}, keywords::init=inits::glorot_uniform); auto b = g->param(name_ + "_b", {1, outDim_}, - keywords::init=inits::zeros); + keywords::init=inits::zeros); params_ = { W, b }; - auto out = affine(in, W, b); + Expr out; + if(layerNorm_) { + auto gamma = g->param(name_ + "_gamma", {1, outDim_}, + keywords::init=inits::from_value(1.0)); + + params_.push_back(gamma); + out = layer_norm(dot(in, W), gamma, b); + } + else { + out = affine(in, W, b); + } switch (activation_) { case act::linear : @@ -81,13 +94,21 @@ namespace marian { {in->shape()[1], outDim_}, keywords::init=inits::glorot_uniform); auto b = g->param(name_ + "_b" + std::to_string(i), - {1, outDim_}, - keywords::init=inits::zeros); - + {1, outDim_}, + keywords::init=inits::zeros); params_.push_back(W); params_.push_back(b); - outputs.push_back(affine(in, W, b)); + if(layerNorm_) { + auto gamma = g->param(name_ + "_gamma" + std::to_string(i), {1, outDim_}, + keywords::init=inits::from_value(1.0)); + + params_.push_back(gamma); + outputs.push_back(layer_norm(dot(in, W), gamma, b)); + } + else { + outputs.push_back(affine(in, W, b)); + } i++; } @@ -145,7 +166,7 @@ namespace marian { auto mask = Get(keywords::mask, nullptr, args...); auto ce = cross_entropy(in, picks); - + if(mask) ce = ce * mask; diff --git a/src/layers/param_initializers.cpp b/src/layers/param_initializers.cpp index 64f4936b..08ca9bc9 100644 --- a/src/layers/param_initializers.cpp +++ b/src/layers/param_initializers.cpp @@ -1,23 +1,3 @@ -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. #include #include @@ -44,9 +24,6 @@ float xor128() { return 0.1 * ((w % 1000) / 1000.f) - 0.05; } -// Use a constant seed for deterministic behaviour. -//std::default_random_engine engine(42); - void zeros(Tensor t) { t->set(0.f); } diff --git a/src/layers/param_initializers.h b/src/layers/param_initializers.h index 51b45944..f1fab867 100644 --- a/src/layers/param_initializers.h +++ b/src/layers/param_initializers.h @@ -1,26 +1,5 @@ #pragma once -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
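One behavioural consequence of the param_initializers.h hunk just below: the engine is still constructed from std::random_device, but the added engine.seed(1234) immediately overrides that nondeterministic seed, so parameter initialization becomes repeatable across runs. The effect in isolation (a sketch, not code from the patch):

    #include <random>

    std::random_device device;
    std::default_random_engine engine(device()); // nondeterministic seed...
    engine.seed(1234);                           // ...discarded: draws are now identical every run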
- #include #include #include "tensors/tensor.h" @@ -47,6 +26,7 @@ template void distribution(std::vector& vals, float a, float b) { std::random_device device; std::default_random_engine engine(device()); + engine.seed(1234); Distribution dist(a, b); auto gen = std::bind(dist, engine); diff --git a/src/layers/rnn.h b/src/layers/rnn.h index cff95028..0635748e 100644 --- a/src/layers/rnn.h +++ b/src/layers/rnn.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -10,7 +12,6 @@ #include "graph/expression_graph.h" #include "layers/generic.h" -#include "layers/attention.h" namespace marian { @@ -56,6 +57,7 @@ class Tanh { template class RNN : public Layer { public: + int dimInput_; int dimState_; dir direction_; bool outputLast_; @@ -63,15 +65,17 @@ class RNN : public Layer { Ptr cell_; template - RNN(const std::string& name, - int dimState, - Cell cell, - Args ...args) + RNN(Ptr graph, + const std::string& name, + int dimInput, + int dimState, + Args ...args) : Layer(name), + dimInput_{dimInput}, dimState_{dimState}, direction_{Get(keywords::direction, dir::forward, args...)}, outputLast_{Get(keywords::output_last, false, args...)}, - cell_(New(cell)) {} + cell_(New(graph, name_, dimInput_, dimState_, args...)) {} Ptr getCell() { return cell_; @@ -80,7 +84,6 @@ class RNN : public Layer { std::vector apply(const Expr input, const Expr initialState, const Expr mask = nullptr, bool reverse = false) { auto xW = cell_->apply1(input); - std::vector outputs; auto state = initialState; for(size_t i = 0; i < input->shape()[2]; ++i) { @@ -114,13 +117,11 @@ class RNN : public Layer { auto graph = input->graph(); int dimInput = input->shape()[1]; - cell_->initialize(graph, name_, dimInput, dimState_, args...); - Expr mask = Get(keywords::mask, nullptr, args...); if(direction_ == dir::backward) { auto states = apply(input, state, mask, true); - //std::reverse(states.begin(), states.end()); + std::reverse(states.begin(), states.end()); if(outputLast_) return states.back(); else @@ -139,63 +140,130 @@ class RNN : public Layer { } }; +template +class MLRNN : public Layer { + private: + int layers_; + bool skip_; + bool skipFirst_; + int dimState_; + std::vector>> rnns_; + + public: + + template + MLRNN(Ptr graph, + const std::string& name, + int layers, + int dimInput, + int dimState, + Args ...args) + : Layer(name), + layers_(layers), + skip_(Get(keywords::skip, false, args...)), + skipFirst_(Get(keywords::skip_first, false, args...)), + dimState_{dimState} { + for(int i = 0; i < layers; ++i) { + rnns_.push_back( + New>(graph, + name + "_l" + std::to_string(i), + i == 0 ? dimInput : dimState, + dimState, + args...) 
+ ); + } + } + + template + std::tuple> + operator()(Expr input, Args ...args) { + Expr output; + std::vector outStates; + for(int i = 0; i < layers_; ++i) { + auto outState = (*rnns_[i])(input, args...); + outStates.push_back(outState); + + if(skip_ && (skipFirst_ || i > 0)) + output = outState + input; + else + output = outState; + + input = output; + } + return std::make_tuple(output, outStates); + } + + template + std::tuple> + operator()(Expr input, + std::vector states, + Args ...args) { + Expr output; + std::vector outStates; + for(int i = 0; i < layers_; ++i) { + auto outState = (*rnns_[i])(input, states[i], args...); + outStates.push_back(outState); + + if(skip_ && (skipFirst_ || i > 0)) + output = outState + input; + else + output = outState; + + input = output; + } + return std::make_tuple(output, outStates); + } +}; + template class BiRNN : public Layer { public: + int layers_; int dimState_; Ptr> rnn1_; Ptr> rnn2_; template - BiRNN(const std::string& name, - int dimState, - Cell cell1, - Cell cell2, - Args ...args) + BiRNN(Ptr graph, + const std::string& name, + int layers, + int dimInput, + int dimState, + Args ...args) : Layer(name), dimState_{dimState}, - rnn1_(New>(name, dimState, cell1, - keywords::direction=dir::forward, - args...)), - rnn2_(New>(name + "_r", dimState, cell2, - keywords::direction=dir::backward, - args...)) {} + rnn1_(New>(graph, name, layers, dimInput, dimState, + keywords::direction=dir::forward, + args...)), + rnn2_(New>(graph, name + "_r", layers, dimInput, dimState, + keywords::direction=dir::backward, + args...)) {} template - BiRNN(const std::string& name, - int dimState, - Args ...args) - : BiRNN(name, dimState, Cell(), Cell(), args...) {} + std::vector operator()(Expr input, Args ...args) { + Expr mask = Get(keywords::mask, nullptr, args...); + auto statesfw = (*rnn1_)(input); + auto statesbw = (*rnn2_)(input, keywords::mask=mask); - template - Expr operator()(Expr input, Args ...args) { - auto graph = input->graph(); - int dimBatch = input->shape()[0]; - auto startState = graph->zeros(keywords::shape={dimBatch, dimState_}); - return (*this)(input, startState, args...); + std::vector outStates; + for(int i = 0; i < layers_; ++i) + outStates.push_back(concatenate({statesfw[i], statesbw[i]}, + keywords::axis=1)); + return outStates; } template - Expr operator()(Expr input, Expr state, Args ...args) { + std::vector operator()(Expr input, std::vector states, Args ...args) { Expr mask = Get(keywords::mask, nullptr, args...); + auto statesfw = (*rnn1_)(input, states); + auto statesbw = (*rnn2_)(input, states, keywords::mask=mask); - auto graph = input->graph(); - int dimInput = input->shape()[1]; - - rnn1_->getCell()->initialize(graph, name_, dimInput, dimState_, args...); - auto states1 = rnn1_->apply(input, state, nullptr); - - rnn2_->getCell()->initialize(graph, name_ + "_r", dimInput, dimState_, args...); - auto states2 = rnn2_->apply(input, state, mask, true); - - std::reverse(states2.begin(), states2.end()); - std::vector states; - for(int i = 0; i < states1.size(); ++i) - states.push_back(concatenate({states1[i], states2[i]}, - keywords::axis=1)); - - return concatenate(states, keywords::axis=2); + std::vector outStates; + for(int i = 0; i < layers_; ++i) + outStates.push_back(concatenate({statesfw[i], statesbw[i]}, + keywords::axis=1)); + return outStates; } }; @@ -255,21 +323,32 @@ Expr gruOps(const std::vector& nodes, bool final = false) { return Expression(nodes, final); } +/***************************************************************/ + 
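The new MLRNN wrapper above stacks layers_ single-layer RNNs and optionally wires residual (skip) connections between them: with skip_ set, each layer past the first (or every layer, with skip_first) outputs outState + input rather than outState alone. Stripped of the keywords plumbing, the stacking loop is essentially the following (a sketch with simplified types, not part of the patch):

    // Sketch of MLRNN's layer stacking.
    Expr input = x;
    for(int i = 0; i < layers_; ++i) {
      Expr outState = (*rnns_[i])(input);           // run layer i
      Expr output = (skip_ && (skipFirst_ || i > 0))
                      ? outState + input            // residual connection
                      : outState;                   // plain stacking
      input = output;                               // feed layer i + 1
    }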
class GRU { private: + std::string prefix_; + Expr U_, W_, b_; + Expr gamma1_; + Expr gamma2_; + bool final_; + bool layerNorm_; + float dropout_; + + Expr dropMaskX_; + Expr dropMaskS_; public: - GRU() {} template - void initialize( - ExpressionGraphPtr graph, + GRU(ExpressionGraphPtr graph, const std::string prefix, int dimInput, int dimState, - Args ...args) { + Args ...args) : prefix_(prefix) { + auto U = graph->param(prefix + "_U", {dimState, 2 * dimState}, keywords::init=inits::glorot_uniform); auto W = graph->param(prefix + "_W", {dimInput, 2 * dimState}, @@ -288,19 +367,49 @@ class GRU { b_ = concatenate({b, bx}, keywords::axis=1); final_ = Get(keywords::final, false, args...); + layerNorm_ = Get(keywords::normalize, false, args...); + + dropout_ = Get(keywords::dropout_prob, 0.0f, args...); + + if(layerNorm_) { + gamma1_ = graph->param(prefix + "_gamma1", {1, 3 * dimState}, + keywords::init=inits::from_value(1.f)); + gamma2_ = graph->param(prefix + "_gamma2", {1, 3 * dimState}, + keywords::init=inits::from_value(1.f)); + } + + if(dropout_> 0.0f) { + dropMaskX_ = graph->dropout(dropout_, {1, dimInput}); + dropMaskS_ = graph->dropout(dropout_, {1, dimState}); + } } - Expr apply(Expr input, Expr state, Expr mask = nullptr) { + Expr apply(Expr input, Expr state, + Expr mask = nullptr) { return apply2(apply1(input), state, mask); } Expr apply1(Expr input) { + if(dropMaskX_) + input = dropout(input, keywords::mask=dropMaskX_); + debug(input, "in"); auto xW = dot(input, W_); + if(layerNorm_) + xW = layer_norm(xW, gamma1_); return xW; } - Expr apply2(Expr xW, Expr state, Expr mask = nullptr) { + Expr apply2(Expr xW, Expr state, + Expr mask = nullptr) { + if(dropMaskS_) + state = dropout(state, keywords::mask=dropMaskS_); + debug(state, "state"); + auto sU = dot(state, U_); + + if(layerNorm_) + sU = layer_norm(sU, gamma2_); + auto output = mask ? 
gruOps({state, xW, sU, b_, mask}, final_) : gruOps({state, xW, sU, b_}, final_); @@ -309,6 +418,7 @@ class GRU { } }; + /***************************************************************/ template @@ -320,31 +430,29 @@ class AttentionCell { public: - AttentionCell(Attention&& att) - : cell1_(New()), - cell2_(New()), - att_(New(att)) {} - - template - void initialize(Ptr graph, - const std::string prefix, - int dimInput, - int dimState, - Args ...args) + template + AttentionCell(Ptr graph, + const std::string prefix, + int dimInput, + int dimState, + Ptr att, + Args ...args) { - cell1_->initialize(graph, - prefix + "_cell1", - dimInput, - dimState, - keywords::final=false, - args...); + cell1_ = New(graph, + prefix + "_cell1", + dimInput, + dimState, + keywords::final=false, + args...); - cell2_->initialize(graph, - prefix + "_cell2", - att_->outputDim(), - dimState, - keywords::final=true, - args...); + att_ = New(att); + + cell2_ = New(graph, + prefix + "_cell2", + att_->outputDim(), + dimState, + keywords::final=true, + args...); } Expr apply(Expr input, Expr state, Expr mask = nullptr) { @@ -361,11 +469,17 @@ class AttentionCell { return cell2_->apply(alignedSourceContext, hidden, mask); } + Ptr getAttention() { + return att_; + } + Expr getContexts() { return concatenate(att_->getContexts(), keywords::axis=2); } + + Expr getLastContext() { + return att_->getContexts().back(); + } }; -typedef AttentionCell CGRU; - } diff --git a/src/marian.h b/src/marian.h index 7e4fecb6..ba646e0b 100644 --- a/src/marian.h +++ b/src/marian.h @@ -25,3 +25,5 @@ #include "graph/expression_graph.h" #include "graph/expression_operators.h" #include "layers/param_initializers.h" +#include "training/training.h" +#include "training/graph_group.h" diff --git a/src/models/nematus.h b/src/models/dl4mt.h similarity index 62% rename from src/models/nematus.h rename to src/models/dl4mt.h index d80e5a5d..5e07fba7 100644 --- a/src/models/nematus.h +++ b/src/models/dl4mt.h @@ -1,7 +1,7 @@ #pragma once #include "data/corpus.h" -#include "command/config.h" +#include "training/config.h" #include "graph/expression_graph.h" #include "layers/rnn.h" #include "layers/param_initializers.h" @@ -11,10 +11,12 @@ namespace marian { -class Nematus : public ExpressionGraph { +class DL4MT { private: Ptr options_; - + + Ptr> rnn_; + int dimSrcVoc_{40000}; int dimSrcEmb_{512}; int dimEncState_{1024}; @@ -25,6 +27,8 @@ class Nematus : public ExpressionGraph { int dimBatch_{64}; + bool normalize_; + void setDims(Ptr graph, Ptr batch) { dimSrcVoc_ = graph->get("Wemb") ? 
graph->get("Wemb")->shape()[0] : dimSrcVoc_; @@ -39,14 +43,14 @@ class Nematus : public ExpressionGraph { } public: - - Nematus() {} - - Nematus(Ptr options) + + DL4MT(Ptr options) : options_(options) { - + auto dimVocabs = options->get>("dim-vocabs"); - + + normalize_ = options->get("normalize"); + dimSrcVoc_ = dimVocabs[0]; dimSrcEmb_ = options->get("dim-emb"); dimEncState_ = options->get("dim-rnn"); @@ -56,16 +60,16 @@ class Nematus : public ExpressionGraph { dimBatch_ = options->get("mini-batch"); } - + void load(Ptr graph, const std::string& name) { using namespace keywords; LOG(info) << "Loading model from " << name; - + auto numpy = cnpy::npz_load(name); - auto parameters = { + std::vector parameters = { // Source word embeddings "Wemb", @@ -102,6 +106,20 @@ class Nematus : public ExpressionGraph { "ff_logit_W", "ff_logit_b", }; + std::vector parametersNorm = { + "decoder_att_gamma1", "decoder_att_gamma2", + "decoder_cell1_gamma1", "decoder_cell1_gamma2", + "decoder_cell2_gamma1", "decoder_cell2_gamma2", + "encoder_gamma1", "encoder_gamma2", + "encoder_r_gamma1", "encoder_r_gamma2", + "ff_logit_l1_gamma0", "ff_logit_l1_gamma1", + "ff_logit_l1_gamma2", "ff_state_gamma" + }; + + if(normalize_) + for(auto& p : parametersNorm) + parameters.push_back(p); + std::map nameMap = { {"decoder_U", "decoder_cell1_U"}, {"decoder_W", "decoder_cell1_W"}, @@ -129,6 +147,9 @@ class Nematus : public ExpressionGraph { }; for(auto name : parameters) { + UTIL_THROW_IF2(numpy.count(name) == 0, + "Parameter " << name << " does not exist."); + Shape shape; if(numpy[name].shape.size() == 2) { shape.set(0, numpy[name].shape[0]); @@ -152,7 +173,7 @@ class Nematus : public ExpressionGraph { const std::string& name) { LOG(info) << "Saving model to " << name; - + unsigned shape[2]; std::string mode = "w"; @@ -274,54 +295,148 @@ class Nematus : public ExpressionGraph { return std::make_tuple(y, yMask, yIdx); } + std::tuple encoder(Ptr graph, + Ptr batch) { + using namespace keywords; + + auto xEmb = Embedding("Wemb", dimSrcVoc_, dimSrcEmb_)(graph); + + Expr x, xMask; + std::tie(x, xMask) = prepareSource(xEmb, batch, 0); + + auto xfw = RNN(graph, "encoder", + dimSrcEmb_, dimEncState_, + normalize=normalize_, + direction=dir::forward)(x); + + auto xbw = RNN(graph, "encoder_r", + dimSrcEmb_, dimEncState_, + normalize=normalize_, + direction=dir::backward)(x, mask=xMask); + + auto xContext = concatenate({xfw, xbw}, axis=1); + + return std::make_tuple(xContext, xMask); + } + + std::tuple step(Expr hyps, + const std::vector hypIdx = {}, + const std::vector embIdx = {}) { + using namespace keywords; + auto graph = hyps->graph(); + + Expr selectedHyps, selectedEmbs; + if(embIdx.empty()) { + selectedHyps = hyps; + selectedEmbs = graph->constant(shape={1, dimTrgEmb_}, + init=inits::zeros); + } + else { + // @TODO : solve this better than reshaping! + selectedHyps = reshape(rows(hyps, hypIdx), + {1, hyps->shape()[1], 1, (int)hypIdx.size()}); + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + selectedEmbs = reshape(rows(yEmb, embIdx), + {1, yEmb->shape()[1], 1, (int)embIdx.size()}); + } + Expr newHyps, logits; + std::tie(newHyps, logits) = step(selectedHyps, selectedEmbs, true); + return std::make_tuple(newHyps, logsoftmax(logits)); + } + + std::tuple step(Expr yInStates, Expr yEmbeddings, + bool single = false) { + using namespace keywords; + + auto yOutStates = (*rnn_)(yEmbeddings, yInStates); + auto yCtx = single ? 
+ rnn_->getCell()->getLastContext() : + rnn_->getCell()->getContexts(); + + //// 2-layer feedforward network for outputs and cost + auto yLogitsL1 = Dense("ff_logit_l1", dimTrgEmb_, + activation=act::tanh, + normalize=normalize_) + (yEmbeddings, yOutStates, yCtx); + + auto yLogitsL2 = Dense("ff_logit_l2", dimTrgVoc_) + (yLogitsL1); + + return std::make_tuple(yOutStates, yLogitsL2); + } + + Expr startState(Expr context, Expr mask) { + using namespace keywords; + + auto meanContext = weighted_average(context, mask, axis=2); + auto start = Dense("ff_state", + dimDecState_, + activation=act::tanh, + normalize=normalize_)(meanContext); + return start; + } + + Expr buildEncoder(Ptr graph, Ptr batch) { + using namespace keywords; + graph->clear(); + rnn_.reset(); + setDims(graph, batch); + + Expr xContext, xMask; + std::tie(xContext, xMask) = encoder(graph, batch); + + auto attention = New("decoder", + xContext, dimDecState_, + mask=xMask, normalize=normalize_); + rnn_ = New>(graph, "decoder", + dimTrgEmb_, dimDecState_, + attention, + normalize=normalize_); + + return startState(xContext, xMask); + } + + std::tuple embeddings(Ptr graph, + Ptr batch) { + using namespace keywords; + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + Expr y, yMask, yIdx; + std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1); + auto yEmpty = graph->zeros(shape={dimBatch_, dimTrgEmb_}); + auto yShifted = concatenate({yEmpty, y}, axis=2); + + return std::make_tuple(yShifted, yMask, yIdx); + } + Expr build(Ptr graph, Ptr batch) { using namespace keywords; graph->clear(); - + rnn_.reset(); setDims(graph, batch); - // Embeddings - auto xEmb = Embedding("Wemb", dimSrcVoc_, dimSrcEmb_)(graph); - auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + Expr xContext, xMask; + std::tie(xContext, xMask) = encoder(graph, batch); + auto yStartStates = startState(xContext, xMask); - Expr x, xMask; - Expr y, yMask, yIdx; + Expr yEmbeddings, yMask, yIdx; + std::tie(yEmbeddings, yMask, yIdx) = embeddings(graph, batch); - std::tie(x, xMask) = prepareSource(xEmb, batch, 0); - std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1); + auto attention = New("decoder", + xContext, dimDecState_, + mask=xMask, normalize=normalize_); + rnn_ = New>(graph, "decoder", + dimTrgEmb_, dimDecState_, + attention, + normalize=normalize_); - // Encoder - auto xContext = BiRNN("encoder", dimEncState_) - (x, mask=xMask); + Expr yOutStates, yLogits; + std::tie(yOutStates, yLogits) = step(yStartStates, yEmbeddings); - auto xMeanContext = weighted_average(xContext, xMask, axis=2); + auto cost = CrossEntropyCost("cost")(yLogits, yIdx, mask=yMask); - // Decoder - auto yStart = Dense("ff_state", - dimDecState_, - activation=act::tanh)(xMeanContext); - - auto yEmpty = graph->zeros(shape={dimBatch_, dimTrgEmb_}); - auto yShifted = concatenate({yEmpty, y}, axis=2); - //auto yShifted = shift(y, 1, axis=2); - - CGRU cgru({"decoder", xContext, dimDecState_, mask=xMask}); - auto yLstm = RNN("decoder", dimDecState_, cgru) - (yShifted, yStart); - auto yCtx = cgru.getContexts(); - - //// 2-layer feedforward network for outputs and cost - auto ff_logit_l1 = Dense("ff_logit_l1", dimTrgEmb_, - activation=act::tanh) - (yShifted, yLstm, yCtx); - - auto ff_logit_l2 = Dense("ff_logit_l2", dimTrgVoc_) - (ff_logit_l1); - - auto cost = CrossEntropyCost("cost") - (ff_logit_l2, yIdx, mask=yMask); - return cost; } }; diff --git a/src/models/encdec.h b/src/models/encdec.h new file mode 100644 index 00000000..0a494e4a --- /dev/null +++ 
b/src/models/encdec.h @@ -0,0 +1,210 @@ +#pragma once + +#include "data/corpus.h" +#include "training/config.h" +#include "graph/expression_graph.h" +#include "layers/rnn.h" +#include "layers/param_initializers.h" +#include "layers/generic.h" +#include "common/logging.h" + +namespace marian { + +class EncoderBase { + protected: + Ptr options_; + + virtual std::tuple + prepareSource(Expr emb, Ptr batch, size_t index) { + using namespace keywords; + std::vector indeces; + std::vector mask; + + for(auto& word : (*batch)[index]) { + for(auto i: word.first) + indeces.push_back(i); + for(auto m: word.second) + mask.push_back(m); + } + + int dimBatch = batch->size(); + int dimEmb = emb->shape()[1]; + int dimWords = (int)(*batch)[index].size(); + + auto graph = emb->graph(); + auto x = reshape(rows(emb, indeces), {dimBatch, dimEmb, dimWords}); + auto xMask = graph->constant(shape={dimBatch, 1, dimWords}, + init=inits::from_vector(mask)); + return std::make_tuple(x, xMask); + } + + public: + EncoderBase(Ptr options) + : options_(options) {} + + virtual std::tuple + build(Ptr, Ptr, size_t = 0) = 0; +}; + +class DecoderBase { + protected: + Ptr options_; + + virtual std::tuple + prepareTarget(Expr emb, Ptr batch, size_t index) { + using namespace keywords; + + std::vector indeces; + std::vector mask; + std::vector findeces; + + for(int j = 0; j < (*batch)[index].size(); ++j) { + auto& trgWordBatch = (*batch)[index][j]; + + for(auto i : trgWordBatch.first) { + findeces.push_back((float)i); + if(j < (*batch)[index].size() - 1) + indeces.push_back(i); + } + + for(auto m : trgWordBatch.second) + mask.push_back(m); + } + + int dimBatch = batch->size(); + int dimEmb = emb->shape()[1]; + int dimWords = (int)(*batch)[index].size(); + + auto graph = emb->graph(); + + auto y = reshape(rows(emb, indeces), + {dimBatch, dimEmb, dimWords - 1}); + + auto yMask = graph->constant(shape={dimBatch, 1, dimWords}, + init=inits::from_vector(mask)); + auto yIdx = graph->constant(shape={(int)findeces.size(), 1}, + init=inits::from_vector(findeces)); + + return std::make_tuple(y, yMask, yIdx); + } + + public: + DecoderBase(Ptr options) + : options_(options) {} + + virtual std::tuple + groundTruth(Ptr graph, + Ptr batch) { + using namespace keywords; + + int dimBatch = batch->size(); + int dimTrgVoc = options_->get>("dim-vocabs").back(); + int dimTrgEmb = options_->get("dim-emb"); + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc, dimTrgEmb)(graph); + Expr y, yMask, yIdx; + std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1); + auto yEmpty = graph->zeros(shape={dimBatch, dimTrgEmb}); + auto yShifted = concatenate({yEmpty, y}, axis=2); + + return std::make_tuple(yShifted, yMask, yIdx); + } + + virtual Expr + buildStartState(Expr context, Expr mask) { + using namespace keywords; + + auto meanContext = weighted_average(context, mask, axis=2); + + bool layerNorm = options_->get("normalize"); + auto start = Dense("ff_state", + options_->get("dim-rnn"), + activation=act::tanh, + normalize=layerNorm)(meanContext); + return start; + } + + virtual std::tuple> + step(Expr embeddings, std::vector states, + Expr context, Expr contextMask, bool single=false) = 0; +}; + +template +class Seq2Seq { + protected: + Ptr options_; + Ptr encoder_; + Ptr decoder_; + + public: + + Seq2Seq(Ptr options) + : options_(options), + encoder_(New(options)), + decoder_(New(options)) + {} + + virtual void load(Ptr graph, + const std::string& name) { + graph->load(name); + } + + virtual void save(Ptr graph, + const std::string& name) { + graph->save(name); 
+ } + + virtual std::tuple, Expr, Expr> + buildEncoder(Ptr graph, + Ptr batch) { + using namespace keywords; + graph->clear(); + encoder_ = New(options_); + decoder_ = New(options_); + + Expr srcContext, srcMask; + std::tie(srcContext, srcMask) = encoder_->build(graph, batch); + auto startState = decoder_->buildStartState(srcContext, srcMask); + + size_t decoderLayers = options_->get("layers-dec"); + std::vector startStates(decoderLayers, startState); + + return std::make_tuple(startStates, srcContext, srcMask); + } + + virtual std::tuple> + step(Expr embeddings, + std::vector states, + Expr context, + Expr contextMask, + bool single=false) { + return decoder_->step(embeddings, states, context, contextMask, single); + } + + virtual Expr build(Ptr graph, + Ptr batch) { + using namespace keywords; + + std::vector startStates; + Expr srcContext, srcMask; + std::tie(startStates, srcContext, srcMask) = buildEncoder(graph, batch); + + Expr trgEmbeddings, trgMask, trgIdx; + std::tie(trgEmbeddings, trgMask, trgIdx) = decoder_->groundTruth(graph, batch); + + Expr trgLogits; + std::vector trgStates; + std::tie(trgLogits, trgStates) = decoder_->step(trgEmbeddings, + startStates, + srcContext, + srcMask); + + auto cost = CrossEntropyCost("cost")(trgLogits, trgIdx, + mask=trgMask); + + return cost; + } + +}; + +} diff --git a/src/models/feedforward.h b/src/models/feedforward.h deleted file mode 100644 index e2e6ad7b..00000000 --- a/src/models/feedforward.h +++ /dev/null @@ -1,119 +0,0 @@ -#pragma once - -#include "graph/expression_graph.h" - -namespace marian { - -/** - * @brief Namespace for code related to managing models in Marian - */ -namespace models { - -/** - * @brief Constructs an expression graph representing a feed-forward classifier. - * - * @param dims number of nodes in each layer of the feed-forward classifier - * - * @return a shared pointer to the newly constructed expression graph - */ -void FeedforwardClassifier(ExpressionGraphPtr g, - const std::vector& dims, - size_t batchSize, - bool training = true) { - using namespace keywords; - std::cerr << "Building Multi-layer Feedforward network" << std::endl; - std::cerr << "\tLayer dimensions:"; - for(auto d : dims) - std::cerr << " " << d; - std::cerr << std::endl; - boost::timer::cpu_timer timer; - - // Construct a shared pointer to an empty expression graph - g->clear(); - - // Construct an input node called "x" and add it to the expression graph. - // - // For each observed data point, this input will hold a vector of values describing that data point. - // dims.front() specifies the size of this vector - // - // For example, in the MNIST task, for any given image in the training set, - // "x" would hold a vector of pixel values for that image. - // - // Because calculating over one observed data point at a time can be inefficient, - // it is customary to operate over a batch of observed data points at once. - // - // At this point, we do not know the batch size: - // whatevs therefore serves as a placeholder for the batch size, which will be specified later - // - // Once the batch size is known, "x" will represent a matrix with dimensions [batch_size, dims.front()]. - // Each row of this matrix will correspond with the observed data vector for one observed data point. - auto x = name(g->input(shape={(int)batchSize, dims.front()}), "x"); - - // Construct an input node called "y" and add it to the expression graph. - // - // For each observed data point, this input will hold the ground truth label for that data point. 
- // dims.back() specifies the size of this vector - // - // For example, in the MNIST task, for any given image in the training set, - // "y" might hold one-hot vector representing which digit (0-9) is shown in that image - // - // Because calculating over one observed data point at a time can be inefficient, - // it is customary to operate over a batch of observed data points at once. - // - // At this point, we do not know the batch size: - // whatevs therefore serves as a placeholder for the batch size, which will be specified later - // - // Once the batch size is known, "y" will represent a matrix with dimensions [batch_size, dims.front()]. - // Each row of this matrix will correspond with the ground truth data vector for one observed data point. - auto y = name(g->input(shape={(int)batchSize, 1}), "y"); - - std::vector layers, weights, biases; - for(int i = 0; i < dims.size()-1; ++i) { - int in = dims[i]; - int out = dims[i+1]; - - if(i == 0) { - // Create a dropout node as the parent of x, - // and place that dropout node as the value of layers[0] - layers.emplace_back(dropout(x, value=0.2)); - } else { - // Multiply the matrix in layers[i-1] by the matrix in weights[i-1] - // Take the result, and perform matrix addition on biases[i-1]. - // Wrap the result in rectified linear activation function, - // and finally wrap that in a dropout node - layers.emplace_back(dropout(relu(affine(layers.back(), weights.back(), biases.back())), - value=0.5)); - } - - // Construct a weight node for the outgoing connections from layer i - weights.emplace_back( - g->param("W" + std::to_string(i), {in, out}, - init=inits::uniform())); - - // Construct a bias node. By definition, a bias node stores the value 1. - // Therefore, we don't actually store the 1. - // Instead, the bias node object stores the weights on the connections - // that are outgoing from the bias node. 
- // These weights are initialized to zero - biases.emplace_back( - g->param("b" + std::to_string(i), {1, out}, - init=inits::zeros)); - } - - // Perform matrix multiplication and addition for the last layer - auto last = affine(layers.back(), weights.back(), biases.back()); - - if(training) { - // Define a top-level node for training - auto cost = name(mean(cross_entropy(last, y), axis=0), "cost"); - } - else { - // Define a top-level node for inference - auto scores = name(softmax(last), "scores"); - } - - std::cerr << "\tTotal time: " << timer.format(5, "%ws") << std::endl; -}; - -} -} diff --git a/src/models/gnmt.h b/src/models/gnmt.h new file mode 100644 index 00000000..634f03e9 --- /dev/null +++ b/src/models/gnmt.h @@ -0,0 +1,153 @@ +#pragma once + +#include "models/encdec.h" +#include "layers/attention.h" + +namespace marian { + + typedef AttentionCell CGRU; + + class EncoderGNMT : public EncoderBase { + public: + EncoderGNMT(Ptr options) + : EncoderBase(options) {} + + std::tuple + build(Ptr graph, + Ptr batch, + size_t batchIdx = 0) { + + using namespace keywords; + + int dimSrcVoc = options_->get>("dim-vocabs")[batchIdx]; + int dimSrcEmb = options_->get("dim-emb"); + int dimEncState = options_->get("dim-rnn"); + bool layerNorm = options_->get("normalize"); + bool skipDepth = options_->get("skip"); + size_t encoderLayers = options_->get("layers-enc"); + float dropoutRnn = options_->get("dropout-rnn"); + + auto xEmb = Embedding("Wemb", dimSrcVoc, dimSrcEmb)(graph); + + Expr x, xMask; + std::tie(x, xMask) = prepareSource(xEmb, batch, batchIdx); + + auto xFw = RNN(graph, "encoder_bi", + dimSrcEmb, dimEncState, + normalize=layerNorm, + dropout_prob=dropoutRnn) + (x); + + auto xBw = RNN(graph, "encoder_bi_r", + dimSrcEmb, dimEncState, + normalize=layerNorm, + direction=dir::backward, + dropout_prob=dropoutRnn) + (x, mask=xMask); + + debug(xFw, "xFw"); + if(encoderLayers > 1) { + auto xBi = concatenate({xFw, xBw}, axis=1); + + Expr xContext; + std::vector states; + std::tie(xContext, states) + = MLRNN(graph, "encoder", encoderLayers - 1, + 2 * dimEncState, dimEncState, + normalize=layerNorm, + skip=skipDepth, + dropout_prob=dropoutRnn) + (xBi); + return std::make_tuple(xContext, xMask); + } + else { + auto xContext = concatenate({xFw, xBw}, axis=1); + return std::make_tuple(xContext, xMask); + } + } +}; + +class DecoderGNMT : public DecoderBase { + private: + Ptr attention_; + + public: + DecoderGNMT(Ptr options) + : DecoderBase(options) {} + + virtual std::tuple> + step(Expr embeddings, + std::vector states, + Expr context, + Expr contextMask, + bool single) { + using namespace keywords; + + int dimTrgVoc = options_->get>("dim-vocabs").back(); + int dimTrgEmb = options_->get("dim-emb"); + int dimDecState = options_->get("dim-rnn"); + bool layerNorm = options_->get("normalize"); + bool skipDepth = options_->get("skip"); + size_t decoderLayers = options_->get("layers-dec"); + float dropoutRnn = options_->get("dropout-rnn"); + + auto graph = embeddings->graph(); + + if(!attention_) + attention_ = New("decoder", + context, dimDecState, + mask=contextMask, + normalize=layerNorm); + RNN rnnL1(graph, "decoder", + dimTrgEmb, dimDecState, + attention_, + normalize=layerNorm, + dropout_prob=dropoutRnn); + auto stateL1 = rnnL1(embeddings, states[0]); + auto alignedContext = single ? 
+ rnnL1.getCell()->getLastContext() : + rnnL1.getCell()->getContexts(); + + std::vector statesOut; + statesOut.push_back(stateL1); + + Expr outputLn; + if(decoderLayers > 1) { + std::vector statesIn; + for(int i = 1; i < states.size(); ++i) + statesIn.push_back(states[i]); + + std::vector statesLn; + std::tie(outputLn, statesLn) = MLRNN(graph, "decoder", + decoderLayers - 1, + dimDecState, dimDecState, + normalize=layerNorm, + dropout_prob=dropoutRnn, + skip=skipDepth, + skip_first=skipDepth) + (stateL1, statesIn); + + statesOut.insert(statesOut.end(), + statesLn.begin(), statesLn.end()); + } + else { + outputLn = stateL1; + } + + //// 2-layer feedforward network for outputs and cost + auto logitsL1 = Dense("ff_logit_l1", dimTrgEmb, + activation=act::tanh, + normalize=layerNorm) + (embeddings, outputLn, alignedContext); + + auto logitsL2 = Dense("ff_logit_l2", dimTrgVoc) + (logitsL1); + + return std::make_tuple(logitsL2, statesOut); + } + +}; + +typedef Seq2Seq GNMT; + +} diff --git a/src/optimizers/optimizers.h b/src/optimizers/optimizers.h index 01b4aebd..b1ea10d5 100644 --- a/src/optimizers/optimizers.h +++ b/src/optimizers/optimizers.h @@ -4,20 +4,18 @@ #include #include "kernels/tensor_operators.h" +#include "training/config.h" #include "optimizers/clippers.h" namespace marian { -// @TODO: modify computation graph to group all paramters in single matrix object. -// This will allow to perform a single large SGD update per batch. Currently there -// are as many updates as different parameters. - class OptimizerBase { public: template - OptimizerBase(Args... args) - : clipper_(Get(keywords::clip, nullptr, args...)) {} - + OptimizerBase(float eta, Args... args) + : clipper_(Get(keywords::clip, nullptr, args...)), + eta_(eta) {} + float backpropUpdate(Ptr graph) { graph->forward(); float cost = graph->topNode()->scalar(); @@ -29,43 +27,46 @@ class OptimizerBase { void update(Ptr graph) { Tensor p = graph->params().vals(); Tensor g = graph->params().grads(); - update(p, g); + update(p, g); } - + void update(Tensor params, Tensor grads) { if(clipper_) clipper_->clip(grads); updateImpl(params, grads); } - - private: - + + void updateSchedule() { + eta_ *= 0.5; + LOG(info) << "Changing learning rate to " << eta_; + } + + protected: + virtual void updateImpl(Tensor params, Tensor grads) = 0; - + Ptr clipper_; + float eta_; }; class Sgd : public OptimizerBase { public: template - Sgd(float eta=0.01, Args... args) - : OptimizerBase(args...), eta_(eta) {} + Sgd(float eta, Args... args) + : OptimizerBase(eta, args...) 
{} private: void updateImpl(Tensor params, Tensor grads) { Element(_1 -= eta_ * _2, params, grads); } - - float eta_; }; // @TODO: Add serialization for historic gradients and parameters class Adagrad : public OptimizerBase { public: template - Adagrad(float eta=0.01, Args ...args) - : OptimizerBase(args...), - eta_(eta), + Adagrad(float eta, Args ...args) + : OptimizerBase(eta, args...), eps_(Get(keywords::eps, 1e-8, args...)) {} @@ -80,7 +81,7 @@ class Adagrad : public OptimizerBase { alloc_->allocate(gt_, {1, totalSize}); gt_->set(0); } - + Element(_1 += (_2 * _2), gt_, grads); @@ -88,7 +89,6 @@ class Adagrad : public OptimizerBase { params, gt_, grads); } - float eta_; float eps_; Ptr alloc_; Tensor gt_; @@ -100,9 +100,8 @@ class Adagrad : public OptimizerBase { class Adam : public OptimizerBase { public: template - Adam(float eta = 0.0001, Args ...args) - : OptimizerBase(args...), - eta_(eta), + Adam(float eta, Args ...args) + : OptimizerBase(eta, args...), beta1_(Get(keywords::beta1, 0.9, args...)), beta2_(Get(keywords::beta2, 0.999, args...)), eps_(Get(keywords::eps, 1e-8, args...)), @@ -110,7 +109,7 @@ class Adam : public OptimizerBase { {} void updateImpl(Tensor params, Tensor grads) { - + if(!mtAlloc_) mtAlloc_ = New(params->getDevice()); if(!vtAlloc_) @@ -128,9 +127,9 @@ class Adam : public OptimizerBase { } t_++; - float denom1 = 1 - pow(beta1_, t_); - float denom2 = 1 - pow(beta2_, t_); - + float denom1 = 1 - std::pow(beta1_, t_); + float denom2 = 1 - std::pow(beta2_, t_); + Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2), mt_, grads); Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)), @@ -141,7 +140,6 @@ class Adam : public OptimizerBase { } private: - float eta_; float beta1_; float beta2_; float eps_; @@ -158,4 +156,29 @@ Ptr Optimizer(Args&& ...args) { return Ptr(new Algorithm(args...)); } +Ptr Optimizer(Ptr options) { + + Ptr clipper = nullptr; + float clipNorm = options->get("clip-norm"); + if(clipNorm > 0) + clipper = Clipper(clipNorm); + + float lrate = options->get("learn-rate"); + + std::string opt = options->get("optimizer"); + + if(opt == "sgd") { + return Optimizer(lrate, keywords::clip=clipper); + } + else if(opt == "adagrad") { + return Optimizer(lrate, keywords::clip=clipper); + } + else if(opt == "adam") { + return Optimizer(lrate, keywords::clip=clipper); + } + else { + UTIL_THROW2("Unknown optimizer: " << opt); + } +} + } diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu index 19aec0cb..5c28066b 100644 --- a/src/tensors/tensor.cu +++ b/src/tensors/tensor.cu @@ -21,7 +21,6 @@ #include -#include #include #include @@ -87,7 +86,7 @@ void TensorBase::set(const std::vector &v) { void TensorBase::copyFrom(Tensor in) { cudaSetDevice(device_); - CUDA_CHECK(cudaMemcpy(data_, in->data(), in->size() * sizeof(float), + CUDA_CHECK(cudaMemcpy(data_ , in->data() , in->size() * sizeof(float), cudaMemcpyDefault)); cudaStreamSynchronize(0); } @@ -100,7 +99,8 @@ std::string TensorBase::debug() { for(int i = 1; i < shape_.size(); ++i) strm << "x" << shape_[i]; strm << " size=" << shape_.elements() - << " (" << shape_.elements() * sizeof(float) << "B)" << std::endl; + << " (" << shape_.elements() * sizeof(float) << "B)"; + strm << " device=" << device_ << std::endl; // values size_t totSize = shape_.elements(); @@ -109,81 +109,90 @@ std::string TensorBase::debug() { strm << std::fixed << std::setprecision(8) << std::setfill(' '); - for(size_t k = 0; k < shape()[2]; ++k) { - strm << "[ "; - if(shape()[0] > 10) { - for (size_t i = 0; i < shape()[0] && i < 3; ++i) { 
- if(i > 0) - strm << std::endl << " "; - for (size_t j = 0; j < shape()[1] && j < 3; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - if(shape()[1] > 3) - strm << "... "; - for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - } - strm << std::endl << " ..."; - for (size_t i = shape()[0] - 3; i < shape()[0]; ++i) { - if(i > 0) - strm << std::endl << " "; - for (size_t j = 0; j < shape()[1] && j < 3; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - if(shape()[1] > 3) - strm << "... "; - for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - } - } - else { - for (size_t i = 0; i < shape()[0] && i < 10; ++i) { - if(i > 0) - strm << std::endl << " "; - for (size_t j = 0; j < shape()[1] && j < 3; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - if(shape()[1] > 3) - strm << "... "; - for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - } - } - strm << "]" << std::endl; + for(size_t l = 0; l < shape()[3]; ++l) { + for(size_t k = 0; k < shape()[2]; ++k) { + strm << "[ "; + if(shape()[0] > 10) { + for (size_t i = 0; i < shape()[0] && i < 3; ++i) { + if(i > 0) + strm << std::endl << " "; + for (size_t j = 0; j < shape()[1] && j < 3; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + if(shape()[1] > 3) + strm << "... "; + for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + } + strm << std::endl << " ..."; + for (size_t i = shape()[0] - 3; i < shape()[0]; ++i) { + if(i > 0) + strm << std::endl << " "; + for (size_t j = 0; j < shape()[1] && j < 3; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + if(shape()[1] > 3) + strm << "... "; + for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + } + } + else { + for (size_t i = 0; i < shape()[0] && i < 10; ++i) { + if(i > 0) + strm << std::endl << " "; + for (size_t j = 0; j < shape()[1] && j < 3; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + if(shape()[1] > 3) + strm << "... 
"; + for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + } + } + strm << "]" << std::endl; + } } return strm.str(); } DeviceGPU::~DeviceGPU() { - cudaSetDevice(device_); - if(data_) - CUDA_CHECK(cudaFree(data_)); + cudaSetDevice(device_); + if(data_) + CUDA_CHECK(cudaFree(data_)); + cudaDeviceSynchronize(); } void DeviceGPU::reserve(size_t size) { cudaSetDevice(device_); - + UTIL_THROW_IF2(size < size_, "New size must be larger than old size"); if(data_) { diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index b61b2bde..78fa0660 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -25,9 +25,6 @@ #include #include #include -#ifdef CUDNN -#include -#endif #include "3rd_party/exception.h" #include "common/definitions.h" @@ -40,30 +37,14 @@ class TensorBase : public std::enable_shared_from_this { float* data_; Shape shape_; size_t device_; -#ifdef CUDNN - cudnnTensorDescriptor_t cudnnDesc_; -#endif public: TensorBase(float* data, Shape shape, size_t device) : data_(data), shape_(shape), device_(device) - { -#ifdef CUDNN - cudnnCreateTensorDescriptor(&cudnnDesc_); - cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT, - shape_[0], shape_[1], - shape_[2], shape_[3], - shape_.stride(0), shape_.stride(1), - shape_.stride(2), shape_.stride(3)); -#endif - } + {} ~TensorBase() - { -#ifdef CUDNN - cudnnDestroyTensorDescriptor(cudnnDesc_); -#endif - } + {} virtual void reset(float* data) { data_ = data; @@ -90,6 +71,10 @@ class TensorBase : public std::enable_shared_from_this { return device_; } + Tensor subtensor(int offset, int size){ + return Tensor(new TensorBase(data_ + offset, {1, size}, device_ )); + } + float get(size_t i); void set(size_t i, float value); @@ -102,12 +87,6 @@ class TensorBase : public std::enable_shared_from_this { void copyFrom(Tensor); -#ifdef CUDNN - cudnnTensorDescriptor_t& cudnn() { - return cudnnDesc_; - } -#endif - std::string debug(); }; diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h index 62932ed8..35b51ccb 100644 --- a/src/tensors/tensor_allocator.h +++ b/src/tensors/tensor_allocator.h @@ -94,9 +94,14 @@ class TensorAllocator { gaps_.insert(lastGap_); } + ~TensorAllocator() { + clear(); + } + void reserve(size_t elements = 0) { float mult = elements / FLOATS + 1; - std::cerr << "Extending reserved space to " << mult * CHUNK << " MB" << std::endl; + LOG(memory) << "Extending reserved space to " + << mult * CHUNK << " MB (device " << device_.getDevice() << ")"; size_t old = device_.capacity(); float* oldStart = device_.data(); @@ -106,8 +111,8 @@ class TensorAllocator { void reserveExact(size_t elements = 0) { size_t mbytes = (elements * sizeof(float)) / MBYTE; - std::cerr << "Reserving space for " << elements - << " floats (" << mbytes << " MB)" << std::endl; + LOG(memory) << "Reserving space for " << elements + << " floats (" << mbytes << " MB, device " << device_.getDevice() << ")"; size_t old = device_.capacity(); float* oldStart = device_.data(); diff --git a/src/test/bn_test.cu b/src/test/bn_test.cu new file mode 100644 index 00000000..77587e79 --- /dev/null +++ b/src/test/bn_test.cu @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "layers/generic.h" +#include "marian.h" + +int main(int argc, char** argv) { + using namespace marian; + using namespace data; + using namespace 
keywords; + + auto options = New(argc, argv, false); + + int batchSize = 128; + + std::vector temp(batchSize * 3072); + std::vector temp2(3072 * 3072); + std::vector indeces(batchSize, 0.f); + + std::random_device rnd_device; + // Specify the engine and distribution. + std::mt19937 mersenne_engine(rnd_device()); + mersenne_engine.seed(1234); + std::uniform_real_distribution dist(-1.f, 1.f); + + auto gen = std::bind(dist, mersenne_engine); + std::generate(std::begin(temp), std::end(temp), gen); + std::generate(std::begin(temp2), std::end(temp2), gen); + + { + auto graph = New(); + graph->setDevice(0); + graph->reserveWorkspaceMB(128); + + auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp)); + auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0)); + auto beta = graph->param("beta", {1, 3072}, init=inits::zeros); + + auto y = layer_norm(x, gamma, beta); + + auto yLogitsL1 = Dense("ff_logit_l1", 512, + activation=act::tanh, + normalize=true) + (y, y, y); + + auto yLogitsL2 = Dense("ff_logit_l2", 50000) + (yLogitsL1); + + auto idx = graph->constant(shape={(int)indeces.size(), 1}, + init=inits::from_vector(indeces)); + auto ce = cross_entropy(yLogitsL2, idx); + auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0); + + debug(x, "x"); + debug(gamma, "gamma"); + debug(beta, "beta"); + + graph->forward(); + graph->backward(); + } + + /*{ + auto graph = New(); + graph->setDevice(0); + graph->reserveWorkspaceMB(128); + + auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp)); + auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0)); + auto beta = graph->param("beta", {1, 3072}, init=inits::zeros); + + auto y = layer_norm(x, gamma, beta); + + auto w = graph->param("w", {3072, 3072}, init=inits::from_vector(temp2)); + + auto y2 = tanh(layer_norm(dot(y, w), gamma, beta)); + + auto idx = graph->constant(shape={(int)indeces.size(), 1}, + init=inits::from_vector(indeces)); + auto ce = cross_entropy(y2, idx); + auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0); + + debug(x, "x"); + debug(gamma, "gamma"); + debug(beta, "beta"); + + graph->forward(); + graph->backward(); + }*/ + + return 0; +} diff --git a/src/test/dropout_test.cu b/src/test/dropout_test.cu index 3e4bad69..17fda988 100644 --- a/src/test/dropout_test.cu +++ b/src/test/dropout_test.cu @@ -6,68 +6,26 @@ #include #include -#include "tensors/tensor_allocator.h" -#include "tensors/tensor.h" -#include "kernels/tensor_operators.h" - -#include "layers/dropout.h" - -#include "kernels/dropout_cudnn.h" +#include "training/config.h" +#include "marian.h" +#include "layers/param_initializers.h" using namespace marian; +using namespace keywords; -int main() { - int cudaDevice = 0; - TensorAllocator* params = new TensorAllocator(cudaDevice); +int main(int argc, char** argv) { + auto c = New(argc, argv); - cublasHandle_t handle = create_handle(cudaDevice); - - int rows = 64; - int cols = 2048; - int layers = 64; - - std::cerr << "Number of elements in tensor: " << rows * cols * layers << std::endl; - int rep = 1000; - const float prob = 0.5f; - - Tensor dropoutMatrix; - params->allocate(dropoutMatrix, {rows, cols, layers}); - - DropoutGenerator dropout(0); - - cudaStreamSynchronize(0); - boost::timer::cpu_timer timer; - - for (int i = 0; i < rep;++i) { - dropout.Generate(dropoutMatrix, prob); + auto g = New(); + g->setDevice(0); + g->reserveWorkspaceMB(512); + for(int i = 0; i < 10; ++i) { + g->clear(); + auto mask = g->dropout(0.2, {10, 3072}); + 
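// A CPU sketch of the mask that g->dropout(0.2, {10, 3072}) below asks the
// GPU kernel (kernels/dropout.cu) to generate. Assumption: the mask is a
// scaled ("inverted") Bernoulli mask, with survivors scaled by 1/(1-p), as
// the "scaling dropout" wording of --dropout-rnn suggests; the dropoutMask
// helper is illustrative only, not Marian API.
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> dropoutMask(size_t n, float p, unsigned seed) {
  std::mt19937 gen(seed);
  std::bernoulli_distribution keep(1.0 - p);
  std::vector<float> mask(n);
  for(auto& m : mask)
    m = keep(gen) ? 1.f / (1.f - p) : 0.f; // so E[mask[i] * x] == x
  return mask;
}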
debug(mask, "mask"); + g->forward(); } - cudaDeviceSynchronize(); - - std::cerr << "DropoutGenerator: " << rep << " repetitions: " << timer.format(5, "%ws") << std::endl; - - Tensor cudnnInTensor, cudnnOutTensor; - params->allocate(cudnnInTensor, {rows, cols, layers}); - params->allocate(cudnnOutTensor, {rows, cols, layers}); - - void* states_; - void* space_; - size_t spaceSize_; - cudnnDropoutDescriptor_t dropDesc_; - - CudnnDropoutPrepare(cudnnInTensor, prob, &dropDesc_, &space_, &spaceSize_, &states_, (size_t)1234); - cudaStreamSynchronize(0); - - cudaDeviceSynchronize(); - timer.start(); - for (int i = 0; i < rep; ++i) { - CudnnDropoutForward(dropDesc_, space_, spaceSize_, cudnnInTensor, cudnnOutTensor); - } - - cudaDeviceSynchronize(); - std::cerr << "CUDNN Dropout: " << rep << " repetitions: " << timer.format(5, "%ws") << std::endl; - - return 0; } diff --git a/src/test/marian_test.cu b/src/test/marian_test.cu index 529cfdb2..0d4fa378 100644 --- a/src/test/marian_test.cu +++ b/src/test/marian_test.cu @@ -7,38 +7,42 @@ #include #include "marian.h" +#include "training/config.h" #include "optimizers/optimizers.h" #include "optimizers/clippers.h" #include "data/batch_generator.h" #include "data/corpus.h" -#include "models/nematus.h" +#include "models/gnmt.h" int main(int argc, char** argv) { using namespace marian; using namespace data; + auto options = New(argc, argv, false); + std::vector files = - {"../test/mini.de", - "../test/mini.en"}; + {"../testln/mini.en", + "../testln/mini.de"}; std::vector vocab = - {"../test/vocab.de.json", - "../test/vocab.en.json"}; + {"../benchmark/marian32K/train.tok.true.bpe.en.json", + "../benchmark/marian32K/train.tok.true.bpe.de.json"}; - std::vector maxVocab = { 50000, 50000 }; + YAML::Node& c = options->get(); + c["train-sets"] = files; + c["vocabs"] = vocab; - auto corpus = DataSet(files, vocab, maxVocab, 50); - BatchGenerator bg(corpus, 10, 20); + auto corpus = DataSet(options); + BatchGenerator bg(corpus, options); auto graph = New(); - graph->setDevice(std::atoi(argv[1])); + graph->setDevice(1); - auto nematus = New(); - nematus->load(graph, "../test/model.npz"); + auto encdec = New(options); + encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz"); graph->reserveWorkspaceMB(128); - float sum = 0; boost::timer::cpu_timer timer; size_t batches = 1; for(int i = 0; i < 1; ++i) { @@ -47,39 +51,15 @@ int main(int argc, char** argv) { auto batch = bg.next(); batch->debug(); - auto costNode = nematus->build(graph, batch); - for(auto p : graph->params()) - debug(p, p->name()); + auto costNode = encdec->build(graph, batch); + //for(auto p : graph->params()) + //debug(p, p->name()); debug(costNode, "cost"); - graph->graphviz("debug.dot"); + //graph->graphviz("debug.dot"); graph->forward(); - graph->backward(); - - float cost = costNode->val()->scalar(); - sum += cost; - - if(batches % 100 == 0) { - std::cout << std::setfill(' ') - << "Epoch " << i - << " Update " << batches - << " Cost " << std::setw(7) << std::setprecision(6) << cost - << " UD " << timer.format(2, "%ws"); - - float seconds = std::stof(timer.format(5, "%w")); - float sentences = 100 * batch->size() / seconds; - - std::cout << " " << std::setw(5) - << std::setprecision(4) - << sentences - << " sentences/s" << std::endl; - timer.start(); - } - - - if(batches % 10000 == 0) - nematus->save(graph, "../test/model.marian." 
+ std::to_string(batches) + ".npz"); + //graph->backward(); batches++; } diff --git a/src/test/marian_translate.cu b/src/test/marian_translate.cu new file mode 100644 index 00000000..63c40878 --- /dev/null +++ b/src/test/marian_translate.cu @@ -0,0 +1,247 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "marian.h" +#include "training/config.h" +#include "optimizers/optimizers.h" +#include "optimizers/clippers.h" +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "models/gnmt.h" +#include "translator/nth_element.h" +#include "common/history.h" + + +namespace marian { + +template +class BeamSearch { + private: + Ptr builder_; + size_t beamSize_; + cudaStream_t stream_{0}; + + public: + BeamSearch(Ptr builder) + : builder_(builder), + beamSize_(12) + {} + + Beam toHyps(const std::vector keys, + const std::vector costs, + size_t vocabSize, + const Beam& beam) { + Beam newBeam; + for(int i = 0; i < keys.size(); ++i) { + int embIdx = keys[i] % vocabSize; + int hypIdx = keys[i] / vocabSize; + float cost = costs[i]; + + newBeam.push_back( + New(beam[hypIdx], embIdx, hypIdx, cost)); + } + return newBeam; + } + + Beam pruneBeam(const Beam& beam) { + Beam newBeam; + for(auto hyp : beam) { + if(hyp->GetWord() > 0) { + newBeam.push_back(hyp); + } + } + return newBeam; + } + + std::tuple, Expr> + step(std::vector hyps, + Expr srcContext, + Expr srcMask, + const std::vector hypIdx = {}, + const std::vector embIdx = {}) { + using namespace keywords; + auto graph = hyps[0]->graph(); + + // @TODO: not hard-coded! + int dimTrgEmb_ = 512; + int dimTrgVoc_ = 50000; + + std::vector selectedHyps; + Expr selectedEmbs; + if(embIdx.empty()) { + selectedHyps = hyps; + selectedEmbs = graph->constant(shape={1, dimTrgEmb_}, + init=inits::zeros); + } + else { + // @TODO : solve this better than reshaping! 
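// A small self-contained illustration of the index bookkeeping behind
// toHyps() above and the row selection just below: the n-best search runs
// over the flattened beamSize x vocabSize score matrix, so each returned
// key packs (hypothesis, word) as hypIdx * vocabSize + embIdx and is
// unpacked with one division and one modulo. decodeKey is illustrative
// only, not Marian API.
void decodeKey(unsigned key, unsigned vocabSize,
               unsigned& hypIdx, unsigned& embIdx) {
  embIdx = key % vocabSize; // word that extends the hypothesis
  hypIdx = key / vocabSize; // which beam entry it extends
}
// e.g. with vocabSize = 50000, key 150007 unpacks to hypIdx 3, embIdx 7.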
+ for(auto h : hyps) + selectedHyps.push_back( + reshape(rows(h, hypIdx), {1, h->shape()[1], 1, (int)hypIdx.size()})); + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + selectedEmbs = reshape(rows(yEmb, embIdx), + {1, yEmb->shape()[1], 1, (int)embIdx.size()}); + } + + Expr logits; + std::vector newHyps; + std::tie(logits, newHyps) = builder_->step(selectedEmbs, + selectedHyps, + srcContext, + srcMask, + true); + return std::make_tuple(newHyps, logsoftmax(logits)); + } + + std::tuple, Expr> + step(std::vector hyps, + Expr srcContext, + Expr srcMask, + const Beam& beam) { + + std::vector hypIndeces; + std::vector embIndeces; + std::vector beamCosts; + + for(auto hyp : beam) { + hypIndeces.push_back(hyp->GetPrevStateIndex()); + embIndeces.push_back(hyp->GetWord()); + beamCosts.push_back(hyp->GetCost()); + } + + auto graph = hyps[0]->graph(); + auto costs = graph->constant(keywords::shape={1, 1, 1, (int)beamCosts.size()}, + keywords::init=inits::from_vector(beamCosts)); + + std::vector newHyps; + Expr probs; + std::tie(newHyps, probs) = step(hyps, + srcContext, + srcMask, + hypIndeces, + embIndeces); + probs = probs + costs; + return std::make_tuple(newHyps, probs); + } + + Ptr search(Ptr graph, + Ptr batch) { + + std::vector startStates; + Expr srcContext, srcMask; + std::tie(startStates, srcContext, srcMask) + = builder_->buildEncoder(graph, batch); + + size_t pos = 0; + auto history = New(0); + Beam beam(1, New()); + bool first = true; + bool final = false; + std::vector beamSizes(1, beamSize_); + auto nth = New(beamSize_, batch->size(), stream_); + + history->Add(beam); + + std::vector hyps; + Expr probs; + do { + + if(first) { + std::tie(hyps, probs) = step(startStates, + srcContext, + srcMask); + pos = graph->forward(); + } + else { + std::tie(hyps, probs) = step(hyps, + srcContext, + srcMask, + beam); + beamSizes[0] = beam.size(); + pos = graph->forward(pos); + } + + size_t dimTrgVoc = probs->shape()[1]; + + std::vector outKeys; + std::vector outCosts; + + for(int i = 0; i < probs->shape()[3]; i++) { + probs->val()->set(i * dimTrgVoc + 1, std::numeric_limits::lowest()); + } + + nth->getNBestList(beamSizes, probs->val(), + outCosts, outKeys, first); + first = false; + + beam = toHyps(outKeys, outCosts, dimTrgVoc, beam); + final = history->size() >= 3 * batch->words(); + history->Add(beam, final); + beam = pruneBeam(beam); + + } while(!beam.empty() && !final); + + return history; + } +}; + +} + +int main(int argc, char** argv) { + using namespace marian; + using namespace data; + + auto options = New(argc, argv, false); + + std::vector files = + {"../benchmark/marian32K/newstest2016.tok.true.bpe.en"}; + //{"../benchmark/marian32K/test.txt"}; + + std::vector vocab = + {"../benchmark/marian32K/train.tok.true.bpe.en.json"}; + + YAML::Node& c = options->get(); + c["train-sets"] = files; + c["vocabs"] = vocab; + + auto corpus = DataSet(options); + BatchGenerator bg(corpus, options); + + auto graph = New(); + graph->setDevice(1); + + auto target = New(); + target->load("../benchmark/marian32K/train.tok.true.bpe.de.json", 50000); + + auto encdec = New(options); + encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz"); + + graph->reserveWorkspaceMB(128); + + boost::timer::cpu_timer timer; + bg.prepare(false); + while(bg) { + auto batch = bg.next(); + auto search = New>(encdec); + auto history = search->search(graph, batch); + + auto results = history->NBest(1); + for(auto r : results) { + for(auto w : r.first) + if(w != 0) + std::cout << (*target)[w] << " "; + 
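// A plain-C++ sketch of the printing step here: r.first holds the word ids
// of one n-best translation, and ids equal to 0 are skipped, which suggests
// (an assumption, not stated in this patch) that 0 is the reserved
// end-of-sentence/padding id. idsToText is illustrative only.
#include <cstddef>
#include <string>
#include <vector>

std::string idsToText(const std::vector<size_t>& ids,
                      const std::vector<std::string>& vocab) {
  std::string out;
  for(auto w : ids) {
    if(w == 0)                // mirrors the `w != 0` check above
      continue;
    if(!out.empty())
      out += ' ';
    out += w < vocab.size() ? vocab[w] : "<unk>";
  }
  return out;
}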
//std::cout << r.second->GetCost() << std::endl; + std::cout << std::endl; + } + } + std::cerr << timer.format(5, "%ws") << std::endl; + + return 0; + +} diff --git a/src/test/tensor_test.cu b/src/test/tensor_test.cu index 5ad967a3..a7851bfa 100644 --- a/src/test/tensor_test.cu +++ b/src/test/tensor_test.cu @@ -1,123 +1,86 @@ #include #include -//#include "tensors/tensor_allocator.h" -//#include "tensors/tensor_gpu.h" -//#include "kernels/tensor_operators.h" -//#include "kernels/thrust_functions.h" - -#include "data/corpus.h" -#include "data/batch_generator.h" +#include "tensors/tensor_allocator.h" +#include "tensors/tensor.h" +#include "kernels/tensor_operators.h" +#include "kernels/thrust_functions.h" +#include "common/logging.h" using namespace marian; - - int main() { + Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v")}; - std::vector files = - {"../benchmark/train.tok.true.en", - "../benchmark/train.tok.true.en", - "../benchmark/train.tok.true.de"}; + Ptr params = New(0); - std::vector vocab = - {"../benchmark/train.tok.true.en.json", - "../benchmark/train.tok.true.en.json", - "../benchmark/train.tok.true.de.json"}; + cublasHandle_t handle = create_handle(0); - std::vector maxVocab = { 50000, 50000, 50000 }; + int words = 64; + int batch = 128; + int hidden = 4096; - using namespace data; - auto corpus = New(files, vocab, maxVocab, 50); - BatchGenerator bg(corpus, 64, 20); + Tensor mappedState; + params->allocate(mappedState, {batch, hidden, 1}); + mappedState->set(0.001); - bg.prepare(); + Tensor mappedContext; + params->allocate(mappedContext, {batch, hidden, words}); + mappedContext->set(0.001); + + Tensor va; + params->allocate(va, {hidden, 1}); + va->set(0.001); + + Tensor out1; + params->allocate(out1, {batch, hidden, words}); + out1->set(0); + + Tensor gMappedState; + params->allocate(gMappedState, {batch, hidden, 1}); + gMappedState->set(0); + + Tensor gMappedContext; + params->allocate(gMappedContext, {batch, hidden, words}); + gMappedContext->set(0.001); + + Tensor gVa; + params->allocate(gVa, {hidden, 1}); + va->set(0.001); + + Tensor gOut1; + params->allocate(gOut1, {batch, hidden, words}); + out1->set(0); + + Tensor out2; + params->allocate(out2, {batch, 1, words}); + out2->set(0); + + boost::timer::cpu_timer timer; + for(int i = 0; i < 5000; ++i) { + Element(_1 = Tanh(_2 + _3), out1, mappedState, mappedContext); + Prod(handle, out2, out1, va, false, false, 0); + Prod(handle, gOut1, out2, va, false, true, 1.0f); + Prod(handle, gVa, out1, out2, true, false, 1.0f); + Add(_1 * (1.f - (_2 *_2)), gMappedState, out1, out1); + Add(_1 * (1.f - (_2 *_2)), gMappedContext, out1, out1); + cudaStreamSynchronize(0); - size_t i = 0; - size_t samples = 0; - while(bg) { - auto batch = bg.next(); - if(i && i % 10000 == 0) - std::cerr << "[" << i << "/" << samples << "]" << std::endl; if(i % 100 == 0) - std::cerr << "."; - i++; - - samples += batch->size(); + std::cout << "." 
<< std::flush; } + std::cout << timer.format(5, "%ws") << std::endl; - - - - //TensorAllocator params = newTensorAllocator(); - // - //cublasHandle_t handle = create_handle(); - // - //int words = 64; - //int batch = 128; - //int hidden = 4096; - // - //Tensor mappedState; - //params->allocate(mappedState, {batch, hidden, 1}); - //mappedState->set(0.001); - // - //Tensor mappedContext; - //params->allocate(mappedContext, {batch, hidden, words}); - //mappedContext->set(0.001); - // - //Tensor va; - //params->allocate(va, {hidden, 1}); - //va->set(0.001); - // - //Tensor out1; - //params->allocate(out1, {batch, hidden, words}); - //out1->set(0); - // - //Tensor gMappedState; - //params->allocate(gMappedState, {batch, hidden, 1}); - //gMappedState->set(0); - // - //Tensor gMappedContext; - //params->allocate(gMappedContext, {batch, hidden, words}); - //gMappedContext->set(0.001); - // - //Tensor gVa; - //params->allocate(gVa, {hidden, 1}); - //va->set(0.001); - // - //Tensor gOut1; - //params->allocate(gOut1, {batch, hidden, words}); - //out1->set(0); - // - //Tensor out2; - //params->allocate(out2, {batch, 1, words}); - //out2->set(0); - // - //boost::timer::cpu_timer timer; - //for(int i = 0; i < 5000; ++i) { - // Element(_1 = Tanh(_2 + _3), out1, mappedState, mappedContext); - // Prod(handle, out2, out1, va, false, false, 0); - // Prod(handle, gOut1, out2, va, false, true, 1.0f); - // Prod(handle, gVa, out1, out2, true, false, 1.0f); - // Add(_1 * (1.f - (_2 *_2)), gMappedState, out1, out1); - // Add(_1 * (1.f - (_2 *_2)), gMappedContext, out1, out1); - // cudaStreamSynchronize(0); - // - // if(i % 100 == 0) - // std::cout << "." << std::flush; - //} - //std::cout << timer.format(5, "%ws") << std::endl; - // - //boost::timer::cpu_timer timer2; - //for(int i = 0; i < 5000; ++i) { - // Att(out2, mappedContext, mappedState, va); - // AttBack(gMappedContext, gMappedState, gVa, - // mappedContext, mappedState, va, out2); - // cudaStreamSynchronize(0); - // if(i % 100 == 0) - // std::cout << "." << std::flush; - //} - //std::cout << timer2.format(5, "%ws") << std::endl; + boost::timer::cpu_timer timer2; + for(int i = 0; i < 5000; ++i) { + Att(va, out2, mappedContext, mappedState, nullptr); + AttBack(gVa, gMappedContext, gMappedState, nullptr, + va, mappedContext, mappedState, out2, nullptr); + cudaStreamSynchronize(0); + if(i % 100 == 0) + std::cout << "." 
<< std::flush; + } + std::cout << timer2.format(5, "%ws") << std::endl; return 0; } diff --git a/src/command/config.cpp b/src/training/config.cpp similarity index 62% rename from src/command/config.cpp rename to src/training/config.cpp index d806f3e4..ecd58d7a 100644 --- a/src/command/config.cpp +++ b/src/training/config.cpp @@ -1,8 +1,10 @@ -#include "command/config.h" #include #include +#include +#include "training/config.h" #include "common/file_stream.h" +#include "common/logging.h" #define SET_OPTION(key, type) \ do { if(!vm_[key].defaulted() || !config_[key]) { \ @@ -14,6 +16,8 @@ do { if(vm_.count(key) > 0) { \ config_[key] = vm_[key].as(); \ }} while(0) +namespace marian { + bool Config::has(const std::string& key) const { return config_[key]; } @@ -26,6 +30,10 @@ const YAML::Node& Config::get() const { return config_; } +YAML::Node& Config::get() { + return config_; +} + void ProcessPaths(YAML::Node& node, const boost::filesystem::path& configPath, bool isPath) { using namespace boost::filesystem; std::set paths = {"model", "trainsets", "vocabs"}; @@ -68,24 +76,18 @@ void ProcessPaths(YAML::Node& node, const boost::filesystem::path& configPath, b } void Config::validate() const { - if (has("trainsets")) { - std::vector tmp = get>("trainsets"); - if (tmp.size() != 2) { - std::cerr << "No trainsets!" << std::endl; - exit(1); - } - } else { - std::cerr << "No trainsets!" << std::endl; - exit(1); + UTIL_THROW_IF2(!has("train-sets") + || get>("train-sets").empty(), + "No train sets given in config file or on command line"); + if(has("vocabs")) { + UTIL_THROW_IF2(get>("vocabs").size() != + get>("train-sets").size(), + "There should be as many vocabularies as training sets"); } - if (has("vocabs")) { - if (get>("vocabs").size() != 2) { - std::cerr << "No vocab files!" << std::endl; - exit(1); - } - } else { - std::cerr << "No vocab files!" << std::endl; - exit(1); + if(has("valid-sets")) { + UTIL_THROW_IF2(get>("valid-sets").size() != + get>("train-sets").size(), + "There should be as many validation sets as training sets"); } } @@ -122,7 +124,7 @@ void OutputRec(const YAML::Node node, YAML::Emitter& out) { } } -void Config::addOptions(int argc, char** argv) { +void Config::addOptions(int argc, char** argv, bool doValidate) { std::string configPath; namespace po = boost::program_options; @@ -133,50 +135,88 @@ void Config::addOptions(int argc, char** argv) { "Configuration file") ("model,m", po::value()->default_value("./model"), "Path prefix for model to be saved") - ("device,d", po::value>() - ->multitoken() - ->default_value(std::vector({0}), "0"), - "Use device(s) no. arg") ("init,i", po::value(), "Load weights from arg before training") ("overwrite", po::value()->default_value(false), "Overwrite model with following checkpoints") - ("trainsets,t", po::value>()->multitoken(), + ("train-sets,t", po::value>()->multitoken(), "Paths to training corpora: source target") ("vocabs,v", po::value>()->multitoken(), - "Paths to vocabulary files, have to correspond to --trainsets") + "Paths to vocabulary files have to correspond to --trainsets. " + "If this parameter is not supplied we look for vocabulary files " + "source.{yml,json} and target.{yml,json}. 
" + "If these files do not exists they are created.") + ("max-length", po::value()->default_value(50), + "Maximum length of a sentence in a training sentence pair") ("after-epochs,e", po::value()->default_value(0), "Finish after this many epochs, 0 is infinity") ("after-batches", po::value()->default_value(0), "Finish after this many batch updates, 0 is infinity") - ("disp-freq", po::value()->default_value(100), + ("disp-freq", po::value()->default_value(1000), "Display information every arg updates") - ("save-freq", po::value()->default_value(30000), + ("save-freq", po::value()->default_value(10000), "Save model file every arg updates") + ("no-shuffle", po::value()->zero_tokens()->default_value(false), + "Skip shuffling of training data before each epoch") ("workspace,w", po::value()->default_value(2048), "Preallocate arg MB of work space") + ("log", po::value(), + "Log training process information to file given by arg") ; - po::options_description hyper("Search options"); - hyper.add_options() - ("max-length", po::value()->default_value(50), - "Maximum length of a sentence in a training sentence pair") - ("mini-batch,b", po::value()->default_value(40), - "Size of mini-batch used during update") - ("maxi-batch", po::value()->default_value(20), - "Number of batches to preload for length-based sorting") - ("lrate,l", po::value()->default_value(0.0002), - "Learning rate for Adam algorithm") - ("clip-norm", po::value()->default_value(1.f), - "Clip gradient norm to arg (0 to disable)") + po::options_description valid("Validation set options"); + valid.add_options() + ("valid-sets", po::value>()->multitoken(), + "Paths to validation corpora: source target") + ("valid-freq", po::value()->default_value(10000), + "Validate model every arg updates") + ("valid-metrics", po::value>() + ->multitoken() + ->default_value(std::vector({"cross-entropy"}), + "cross-entropy"), + "Metric to use during validation: cross-entropy, perplexity. 
" + "Multiple metrics can be specified") + ("early-stopping", po::value()->default_value(10), + "Stop if the first validation metric does not improve for arg consecutive " + "validation steps") + ("valid-log", po::value(), + "Log validation scores to file given by arg") + ; + + po::options_description model("Model options"); + model.add_options() ("dim-vocabs", po::value>() ->multitoken() ->default_value(std::vector({50000, 50000}), "50000 50000"), "Maximum items in vocabulary ordered by rank") ("dim-emb", po::value()->default_value(512), "Size of embedding vector") ("dim-rnn", po::value()->default_value(1024), "Size of rnn hidden state") - ("no-shuffle", po::value()->zero_tokens()->default_value(false), - "Skip shuffling of training data before each epoch") + ("layers-enc", po::value()->default_value(8), "Number of encoder layers") + ("layers-dec", po::value()->default_value(8), "Number of decoder layers") + ("skip", po::value()->zero_tokens()->default_value(false), + "Use skip connections") + ("normalize", po::value()->zero_tokens()->default_value(false), + "Enable layer normalization") + ("dropout-rnn", po::value()->default_value(0), + "Scaling dropout along rnn layers and time (0 = no dropout)") + ; + + po::options_description opt("Optimizer options"); + opt.add_options() + ("mini-batch,b", po::value()->default_value(64), + "Size of mini-batch used during update") + ("maxi-batch", po::value()->default_value(100), + "Number of batches to preload for length-based sorting") + ("optimizer,o", po::value()->default_value("adam"), + "Optimization algorithm (possible values: sgd, adagrad, adam") + ("learn-rate,l", po::value()->default_value(0.0001), + "Learning rate") + ("clip-norm", po::value()->default_value(1.f), + "Clip gradient norm to arg (0 to disable)") + ("device,d", po::value>() + ->multitoken() + ->default_value(std::vector({0}), "0"), + "GPUs to use for training. 
Asynchronous SGD is used with multiple devices.") ; po::options_description configuration("Configuration meta options"); @@ -191,7 +231,9 @@ void Config::addOptions(int argc, char** argv) { po::options_description cmdline_options("Allowed options"); cmdline_options.add(general); - cmdline_options.add(hyper); + cmdline_options.add(valid); + cmdline_options.add(model); + cmdline_options.add(opt); cmdline_options.add(configuration); boost::program_options::variables_map vm_; @@ -223,14 +265,25 @@ void Config::addOptions(int argc, char** argv) { SET_OPTION("device", std::vector); SET_OPTION_NONDEFAULT("init", std::string); SET_OPTION("overwrite", bool); + SET_OPTION_NONDEFAULT("log", std::string); // SET_OPTION_NONDEFAULT("trainsets", std::vector); - if (!vm_["trainsets"].empty()) { - config_["trainsets"] = vm_["trainsets"].as>(); + if (!vm_["train-sets"].empty()) { + config_["train-sets"] = vm_["train-sets"].as>(); + } + if (!vm_["valid-sets"].empty()) { + config_["valid-sets"] = vm_["valid-sets"].as>(); } if (!vm_["vocabs"].empty()) { config_["vocabs"] = vm_["vocabs"].as>(); } + + SET_OPTION_NONDEFAULT("valid-sets", std::vector); + SET_OPTION("valid-freq", size_t); + SET_OPTION("valid-metrics", std::vector); + SET_OPTION("early-stopping", size_t); + SET_OPTION_NONDEFAULT("valid-log", std::string); + // SET_OPTION_NONDEFAULT("vocabs", std::vector); SET_OPTION("after-epochs", size_t); SET_OPTION("after-batches", size_t); @@ -242,14 +295,22 @@ void Config::addOptions(int argc, char** argv) { SET_OPTION("max-length", size_t); SET_OPTION("mini-batch", int); SET_OPTION("maxi-batch", int); - SET_OPTION("lrate", double); + SET_OPTION("optimizer", std::string); + SET_OPTION("learn-rate", double); SET_OPTION("clip-norm", double); SET_OPTION("dim-vocabs", std::vector); + + SET_OPTION("layers-enc", int); + SET_OPTION("layers-dec", int); SET_OPTION("dim-emb", int); SET_OPTION("dim-rnn", int); SET_OPTION("no-shuffle", bool); - - validate(); + SET_OPTION("normalize", bool); + SET_OPTION("dropout-rnn", float); + SET_OPTION("skip", bool); + + if(doValidate) + validate(); if (get("relative-paths") && !vm_["dump-config"].as()) ProcessPaths(config_, boost::filesystem::path{configPath}.parent_path(), false); @@ -263,9 +324,17 @@ void Config::addOptions(int argc, char** argv) { } -void Config::logOptions() { - std::stringstream ss; +void Config::log() { + createLoggers(*this); + YAML::Emitter out; OutputRec(config_, out); - std::cerr << "Options: \n" << out.c_str() << std::endl; + std::string conf = out.c_str(); + + std::vector results; + boost::algorithm::split(results, conf, boost::is_any_of("\n")); + for(auto &r : results) + LOG(config) << r; +} + } diff --git a/src/command/config.h b/src/training/config.h similarity index 68% rename from src/command/config.h rename to src/training/config.h index 22220a19..8ce0cf58 100644 --- a/src/command/config.h +++ b/src/training/config.h @@ -1,12 +1,17 @@ #pragma once -#include #include +#include "3rd_party/yaml-cpp/yaml.h" +#include "common/logging.h" + +namespace marian { + class Config { public: - Config(int argc, char** argv) { - addOptions(argc, argv); + Config(int argc, char** argv, bool validate = true) { + addOptions(argc, argv, validate); + log(); } bool has(const std::string& key) const; @@ -19,12 +24,14 @@ class Config { } const YAML::Node& get() const; + YAML::Node& get(); + YAML::Node operator[](const std::string& key) const { return get(key); } - void addOptions(int argc, char** argv); - void logOptions(); + void addOptions(int argc, char** argv, bool 
validate); + void log(); void validate() const; template @@ -37,3 +44,5 @@ class Config { std::string inputPath; YAML::Node config_; }; + +} diff --git a/src/parallel/graph_group.h b/src/training/graph_group.h similarity index 53% rename from src/parallel/graph_group.h rename to src/training/graph_group.h index b3f76e4b..7fd8af59 100644 --- a/src/parallel/graph_group.h +++ b/src/training/graph_group.h @@ -1,242 +1,267 @@ #pragma once #include +#include #include "common/definitions.h" #include "3rd_party/threadpool.h" +#include "optimizers/optimizers.h" +#include "training/training.h" +#include "training/validator.h" namespace marian { - -class Reporter { - public: - Ptr options_; - - float costSum{0}; - size_t epochs{1}; - - size_t samples{0}; - size_t wordsDisp{0}; - size_t batches{0}; - - boost::timer::cpu_timer timer; - - public: - Reporter(Ptr options) : options_(options) {} - - void update(float cost, Ptr batch) { - static std::mutex sMutex; - std::lock_guard guard(sMutex); - - costSum += cost; - samples += batch->size(); - wordsDisp += batch->words(); - batches++; - //if(options.get("after-batches") - // && batches >= options.get("after-batches")) - // break; - - if(batches % options_->get("disp-freq") == 0) { - std::stringstream ss; - ss << "Ep. " << epochs - << " : Up. " << batches - << " : Sen. " << samples - << " : Cost " << std::fixed << std::setprecision(2) - << costSum / options_->get("disp-freq") - << " : Time " << timer.format(2, "%ws"); - - float seconds = std::stof(timer.format(5, "%w")); - float wps = wordsDisp / (float)seconds; - - ss << " : " << std::fixed << std::setprecision(2) - << wps << " words/s"; - - LOG(info) << ss.str(); - - timer.start(); - costSum = 0; - wordsDisp = 0; - } - } -}; class GraphGroup { protected: Ptr options_; Ptr reporter_; Ptr opt_; - + std::vector> graphs_; - + public: GraphGroup(Ptr options) - : options_(options) { - - Ptr clipper = nullptr; - float clipNorm = options_->get("clip-norm"); - float lrate = options_->get("lrate"); - if(clipNorm > 0) - clipper = Clipper(clipNorm); - - opt_ = Optimizer(lrate, - keywords::clip=clipper); - } - + : options_(options), opt_(Optimizer(options)) { } + virtual void update(Ptr) = 0; - + virtual void setReporter(Ptr reporter) { - reporter_ = reporter; + reporter_ = reporter; } - + + virtual void load() = 0; + virtual void save() = 0; }; template -class AsynchronousGraphGroup : public GraphGroup { +class AsyncGraphGroup : public GraphGroup { private: - Ptr builder_; - + std::vector> builders_; + std::vector devices_; - ThreadPool pool_; - + std::vector> graphs_; - + std::mutex sync_; - - Tensor params_; - Ptr paramsAlloc_; - - Tensor grads_; - Ptr gradsAlloc_; - + std::vector shardSync_; + + std::vector params_; + std::vector > paramsAlloc_; + + std::vector grads_; + std::vector> gradsAlloc_; + + std::vector> shardOpt_; + + int shardSize_; + + ThreadPool pool_; + void fetchParams(Tensor oldParams) { if(graphs_.size() < 2) return; - + // @TODO read guard on parameters - std::lock_guard guard(sync_); - oldParams->copyFrom(params_); + int pos = 0; + + std::vector threads; + for (int idx = 0; idx < devices_.size(); idx++) { + threads.emplace_back( std::thread( [=](int idx, int pos) { + //individual mutex per-shard + std::lock_guard guard( shardSync_[idx] ); + oldParams->subtensor(pos , params_[idx]->size())->copyFrom(params_[idx]); + }, idx, pos) ); + + pos += shardSize_; + } + for (auto &&t : threads) { + t.join(); + } } - + void pushGradients(Tensor newGrads) { if(graphs_.size() < 2) { 
opt_->update(graphs_[0]); } else { - std::lock_guard guard(sync_); - grads_->copyFrom(newGrads); - opt_->update(params_, grads_); + // add instead of copy? + std::vector threads; + int pos = 0; + for (int idx = 0; idx < devices_.size(); idx++) { + threads.emplace_back( std::thread([=](int idx, int pos) { + //individual mutex per-shard + std::lock_guard guard( shardSync_[idx] ); + grads_[idx]->copyFrom( newGrads->subtensor(pos , grads_[idx]->size() ) ); + shardOpt_[idx]->update(params_[idx], grads_[idx]); + + cudaStreamSynchronize(0); + } , idx, pos) ); + + pos += shardSize_; + } + for(auto&& t : threads) + t.join(); } } - + void execute(Ptr batch) { static bool first = true; if(first && graphs_.size() > 1) { - // initialize the paramters - for(auto graph : graphs_) { - builder_->build(graph, batch); - graph->forward(); + // initialize the parameters + for(size_t i = 0; i < graphs_.size(); ++i) { + builders_[i]->build(graphs_[i], batch); + graphs_[i]->forward(); } - - if(!params_) { - paramsAlloc_ = New(graphs_[0]->getDevice()); - + + if(params_.size() == 0) { int totalSize = graphs_[0]->params().vals()->size(); - paramsAlloc_->reserveExact(totalSize); - paramsAlloc_->allocate(params_, {1, totalSize}); + shardSize_ = ceil(totalSize / devices_.size()); + + int pos = 0; + //parameter sharding + for (auto device : devices_){ + int __size__ = min(shardSize_, totalSize); + totalSize -= __size__; + Tensor param_; + Ptr allocator_ = New(device); + + allocator_->reserveExact(__size__); + allocator_->allocate(param_, {1, __size__}); + paramsAlloc_.push_back(allocator_); + param_->copyFrom( graphs_[0]->params().vals()->subtensor( pos , __size__ ) ); + params_.push_back(param_); + pos += __size__; + + } } - - if(!grads_) { - gradsAlloc_ = New(graphs_[0]->getDevice()); - + if(grads_.size() == 0) { int totalSize = graphs_[0]->params().vals()->size(); - gradsAlloc_->reserveExact(totalSize); - gradsAlloc_->allocate(grads_, {1, totalSize}); + + for (auto device : devices_){ + int __size__ = min(shardSize_, totalSize); + totalSize -= __size__; + Tensor grad_; + Ptr allocator_ = New(device); + + allocator_->reserveExact(__size__); + allocator_->allocate(grad_, {1, __size__}); + gradsAlloc_.push_back(allocator_); + grads_.push_back(grad_); + + } } - - params_->copyFrom(graphs_[0]->params().vals()); + first = false; } - + auto task = [this](Ptr batch) { static size_t i = 0; thread_local Ptr graph; + thread_local Ptr builder; + thread_local size_t t = 0; + if(!graph) { std::lock_guard lock(sync_); - graph = graphs_[i++]; + graph = graphs_[i]; + builder = builders_[i++]; } - - builder_->build(graph, batch); - + + builder->build(graph, batch); fetchParams(graph->params().vals()); - + graph->forward(); float cost = graph->topNode()->scalar(); graph->backward(); - + + cudaStreamSynchronize(0); pushGradients(graph->params().grads()); - + if(reporter_) { + std::lock_guard guard(sync_); reporter_->update(cost, batch); if(reporter_->batches % options_->get("save-freq") == 0) this->save(); + size_t prevStalled = reporter_->stalled(); + reporter_->validate(graph); + if(prevStalled < reporter_->stalled()) + for(auto opt : shardOpt_) + opt->updateSchedule(); } + + t++; }; - + pool_.enqueue(task, batch); - } - - public: - AsynchronousGraphGroup(Ptr options) - : GraphGroup(options), - builder_{New(options_)}, - devices_{options_->get>("device")}, - pool_{devices_.size(), devices_.size() } { - - for(auto device : devices_) { - graphs_.emplace_back(New()); - graphs_.back()->setDevice(device); - 
graphs_.back()->reserveWorkspaceMB(options_->get("workspace")); + } + + void load() { + if(options_->has("init")) { + std::string init = options_->get("init"); + size_t i = 0; + for(auto graph : graphs_) + builders_[i++]->load(graph, init); } } - + + public: + typedef Builder builder_type; + + AsyncGraphGroup(Ptr options) + : GraphGroup(options), + devices_{options_->get>("device")}, + pool_{devices_.size(), devices_.size()}, + shardSync_{devices_.size()} { + + for(auto device : devices_) { + auto graph = New(); + graph->setDevice(device); + graph->reserveWorkspaceMB(options_->get("workspace")); + graphs_.push_back(graph); + shardOpt_.push_back(Optimizer(options_)); + builders_.push_back(New(options_)); + } + + load(); + } + void update(Ptr batch) { execute(batch); } - + void save() { - std::lock_guard guard(sync_); if(options_->get("overwrite")) { std::string name = options_->get("model") + ".npz"; - builder_->save(graphs_[0], name); + builders_[0]->save(graphs_[0], name); } else { std::string name = options_->get("model") + "." + std::to_string(reporter_->batches) + ".npz"; - builder_->save(graphs_[0], name); + builders_[0]->save(graphs_[0], name); } } }; - + template -class SynchronousGraphGroup : public GraphGroup { +class SyncGraphGroup : public GraphGroup { private: Ptr builder_; std::vector> batches_; - + bool first_{true}; - + void accumulateGradients(Ptr master, std::vector> graphs) { if(graphs_.size() < 2) { return; } - + Tensor grads = master->params().grads(); Tensor tempGrads; master->tensor(tempGrads, grads->shape()); - + for(auto graph : graphs) { if(graph != master) { Tensor remoteGrads = graph->params().grads(); @@ -244,24 +269,24 @@ class SynchronousGraphGroup : public GraphGroup { Element(_1 += _2, grads, tempGrads); } } - + float denom = graphs_.size(); Element(_1 /= denom, grads); } - + void distributeParameters(Ptr master, std::vector> graphs) { if(graphs_.size() < 2) return; - - Tensor params = master->params().vals(); + + Tensor params = master->params().vals(); for(auto graph : graphs) { if(graph != master) { graph->params().vals()->copyFrom(params); } } } - + void execute() { if(first_) { for(auto graph : graphs_) { @@ -271,66 +296,77 @@ class SynchronousGraphGroup : public GraphGroup { distributeParameters(graphs_[0], graphs_); first_ = false; } - + auto task = [this](int i, Ptr batch) { thread_local int j = -1; if(j == -1) j = i; auto localGraph = this->graphs_[j]; - + builder_->build(localGraph, batch); localGraph->forward(); float cost = localGraph->topNode()->scalar(); localGraph->backward(); - + if(reporter_) { reporter_->update(cost, batch); if(reporter_->batches % options_->get("save-freq") == 0) this->save(); } }; - + { size_t workers = graphs_.size(); ThreadPool pool(workers, workers); - + for(int i = 0; i < batches_.size(); ++i) pool.enqueue(task, i % (int)workers, batches_[i]); - } + } accumulateGradients(graphs_[0], graphs_); opt_->update(graphs_[0]); distributeParameters(graphs_[0], graphs_); - + batches_.clear(); } - + + void load() { + if(options_->has("init")) { + std::string init = options_->get("init"); + for(auto graph : graphs_) + builder_->load(graph, init); + } + } + public: - SynchronousGraphGroup(Ptr options) + typedef Builder builder_type; + + SyncGraphGroup(Ptr options) : GraphGroup(options), builder_{New(options_)} { - + auto devices = options_->get>("device"); size_t workers = devices.size(); - + for(auto device : devices) { graphs_.emplace_back(New()); graphs_.back()->setDevice(device); 
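// A reduced sketch of the sharded asynchronous update implemented by
// AsyncGraphGroup above (contrast with the synchronous accumulate-then-
// broadcast scheme of SyncGraphGroup being set up here): the parameter
// vector is cut into one shard per device, with shard size roughly
// ceil(total / devices) as above, and each shard owns its own mutex and
// optimizer, so a worker only locks the slice it is reading or writing.
// The Shard struct and sgdPush helper are illustrative stand-ins for
// params_/shardSync_/shardOpt_, with plain SGD in place of the
// configurable optimizer.
#include <cstddef>
#include <mutex>
#include <vector>

struct Shard {
  std::mutex guard;          // per-shard lock, like shardSync_[idx]
  std::vector<float> params; // this device's slice of the parameters
};

void sgdPush(Shard& shard, const std::vector<float>& gradSlice, float eta) {
  std::lock_guard<std::mutex> lock(shard.guard); // lock only this shard
  for(size_t i = 0; i < shard.params.size(); ++i)
    shard.params[i] -= eta * gradSlice[i];
}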
graphs_.back()->reserveWorkspaceMB(options_->get("workspace")); } - + + load(); } - - ~SynchronousGraphGroup() { + + ~SyncGraphGroup() { execute(); } - + void update(Ptr batch) { batches_.push_back(batch); if(batches_.size() == graphs_.size()) execute(); } - + void save() { if(options_->get("overwrite")) { std::string name = options_->get("model") + ".npz"; @@ -342,6 +378,7 @@ class SynchronousGraphGroup : public GraphGroup { builder_->save(graphs_[0], name); } } + }; - -} \ No newline at end of file + +} diff --git a/src/training/training.h b/src/training/training.h new file mode 100644 index 00000000..10c27d41 --- /dev/null +++ b/src/training/training.h @@ -0,0 +1,152 @@ +#pragma once + +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "training/config.h" +#include "training/validator.h" + +namespace marian { + +class Reporter { + public: + Ptr options_; + std::vector> validators_; + + float costSum{0}; + size_t epochs{1}; + + size_t samples{0}; + size_t wordsDisp{0}; + size_t batches{0}; + + boost::timer::cpu_timer timer; + + public: + Reporter(Ptr options) : options_(options) {} + + bool keepGoing() { + // stop if it reached the maximum number of epochs + if(options_->get("after-epochs") > 0 + && epochs > options_->get("after-epochs")) + return false; + + // stop if it reached the maximum number of batch updates + if(options_->get("after-batches") > 0 + && batches >= options_->get("after-batches")) + return false; + + // stop if the first validator did not improve for a given number of checks + if(options_->get("early-stopping") > 0 + && !validators_.empty() + && validators_[0]->stalled() >= options_->get("early-stopping")) + return false; + + return true; + } + + void increaseEpoch() { + LOG(info) << "Seen " << samples << " samples"; + + epochs++; + samples = 0; + + LOG(info) << "Starting epoch " << epochs; + } + + void finished() { + LOG(info) << "Training finshed"; + } + + void addValidator(Ptr validator) { + validators_.push_back(validator); + } + + void validate(Ptr graph) { + if(batches % options_->get("valid-freq") == 0) { + for(auto validator : validators_) { + if(validator) { + size_t stalledPrev = validator->stalled(); + float value = validator->validate(graph); + std::stringstream ss; + ss << batches << " : "; + ss << validator->type() << " : " << value; + if(validator->stalled() > 0) + ss << " : stalled " << validator->stalled() << " times"; + else + ss << " : new best"; + LOG(valid) << ss.str(); + } + } + } + } + + size_t stalled() { + for(auto validator : validators_) + if(validator) + return validator->stalled(); + return 0; + } + + void update(float cost, Ptr batch) { + costSum += cost; + samples += batch->size(); + wordsDisp += batch->words(); + batches++; + + if(batches % options_->get("disp-freq") == 0) { + std::stringstream ss; + ss << "Ep. " << epochs + << " : Up. " << batches + << " : Sen. 
" << samples + << " : Cost " << std::fixed << std::setprecision(2) + << costSum / options_->get("disp-freq") + << " : Time " << timer.format(2, "%ws"); + + float seconds = std::stof(timer.format(5, "%w")); + float wps = wordsDisp / (float)seconds; + + ss << " : " << std::fixed << std::setprecision(2) + << wps << " words/s"; + + LOG(info) << ss.str(); + + timer.start(); + costSum = 0; + wordsDisp = 0; + } + } +}; + +template +void Train(Ptr options) { + using namespace data; + using namespace keywords; + + auto trainCorpus = New(options); + auto batchGenerator = New>(trainCorpus, + options); + auto reporter = New(options); + + if(options->has("valid-sets") && options->get("valid-freq") > 0) { + for(auto validator : Validators(trainCorpus->getVocabs(), + options)) + reporter->addValidator(validator); + } + + auto model = New(options); + model->setReporter(reporter); + + while(reporter->keepGoing()) { + batchGenerator->prepare(!options->get("no-shuffle")); + while(*batchGenerator && reporter->keepGoing()) { + auto batch = batchGenerator->next(); + model->update(batch); + } + if(reporter->keepGoing()) + reporter->increaseEpoch(); + } + reporter->finished(); + model->save(); +} + +} diff --git a/src/training/validator.h b/src/training/validator.h new file mode 100644 index 00000000..7a58fa02 --- /dev/null +++ b/src/training/validator.h @@ -0,0 +1,146 @@ +#pragma once + + #include + +#include "training/config.h" +#include "graph/expression_graph.h" +#include "data/corpus.h" +#include "data/batch_generator.h" + +namespace marian { + + class Validator { + protected: + Ptr options_; + std::vector> vocabs_; + float lastBest_; + size_t stalled_{0}; + + public: + Validator(std::vector> vocabs, + Ptr options) + : options_(options), + vocabs_(vocabs), + lastBest_{lowerIsBetter() ? 
+ std::numeric_limits::max() : + std::numeric_limits::lowest() } { + } + + virtual std::string type() = 0; + + virtual bool lowerIsBetter() { + return true; + } + + size_t stalled() { + return stalled_; + } + + float validate(Ptr graph) { + using namespace data; + auto validPaths = options_->get>("valid-sets"); + auto corpus = New(validPaths, vocabs_, options_); + Ptr> batchGenerator + = New>(corpus, options_); + batchGenerator->prepare(false); + + float val = validate(graph, batchGenerator); + if(lowerIsBetter() && lastBest_ > val || + !lowerIsBetter() && lastBest_ < val) { + stalled_ = 0; + lastBest_ = val; + } + else { + stalled_++; + } + return val; + }; + + virtual float validate(Ptr, + Ptr>) = 0; + + }; + + template + class CrossEntropyValidator : public Validator { + private: + Ptr builder_; + + public: + CrossEntropyValidator(std::vector> vocabs, + Ptr options) + : Validator(vocabs, options), + builder_(New(options)) {} + + float validate(Ptr graph, + Ptr> batchGenerator) { + float cost = 0; + size_t samples = 0; + + while(*batchGenerator) { + auto batch = batchGenerator->next(); + builder_->build(graph, batch); + graph->forward(); + + cost += graph->topNode()->scalar() * batch->size(); + samples += batch->size(); + } + + return cost / samples; + } + + std::string type() { return "cross-entropy"; } + }; + + template + class PerplexityValidator : public Validator { + private: + Ptr builder_; + + public: + PerplexityValidator(std::vector> vocabs, + Ptr options) + : Validator(vocabs, options), + builder_(New(options)) {} + + float validate(Ptr graph, + Ptr> batchGenerator) { + float cost = 0; + size_t words = 0; + + while(*batchGenerator) { + auto batch = batchGenerator->next(); + builder_->build(graph, batch); + graph->forward(); + + cost += graph->topNode()->scalar() * batch->size(); + words += batch->words(); + } + + return expf(cost / words); + } + + std::string type() { return "perplexity"; } + + }; + + template + std::vector> Validators(std::vector> vocabs, + Ptr options) { + std::vector> validators; + + auto validMetrics = options->get>("valid-metrics"); + for(auto metric : validMetrics) { + if(metric == "cross-entropy") { + auto validator = New>(vocabs, options); + validators.push_back(validator); + } + if(metric == "perplexity") { + auto validator = New>(vocabs, options); + validators.push_back(validator); + } + } + return validators; + } + +} diff --git a/src/translator/nth_element.cu b/src/translator/nth_element.cu new file mode 100644 index 00000000..6d0ab77a --- /dev/null +++ b/src/translator/nth_element.cu @@ -0,0 +1,360 @@ +#include + +#include "translator/nth_element.h" + +namespace marian { + +void HandleError(cudaError_t err, const char *file, int line ) { + if (err != cudaSuccess) { + std::cerr << "ERROR: " << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl; + exit( EXIT_FAILURE ); + } +} + +#define UNROLL_MAXARG_LOOP( n, max ) \ + if (tid < (n) && tid + (n) < ( max ) ) { \ + if (sdata[tid + ( n ) ] > sdata[tid]) { \ + sdata[tid] = sdata[tid + ( n ) ]; \ + indices[tid] = indices[tid + ( n ) ]; \ + } \ + } + +#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) + +__global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatches, int* batchFirstElementIdxs) { + extern __shared__ float sdata[]; + __shared__ int indices[512]; + + int tid = threadIdx.x; + + for (int batchIdx = 0; batchIdx < numBatches; ++batchIdx) { + int begin = batchFirstElementIdxs[batchIdx]; + int end = batchFirstElementIdxs[batchIdx + 
1]; + + int i = begin + blockIdx.x * (blockDim.x * 2) + tid; + + sdata[tid] = -3.40282e+38f; + + if (i < end) { + sdata[tid] = d_in[i]; + indices[tid] = i; + } + + if (i + blockDim.x < end) { + float a = d_in[i]; + float b = d_in[i + blockDim.x]; + if (a > b) { + sdata[tid] = a; + indices[tid] = i; + } else { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + + while (i + 2 * gridDim.x * blockDim.x < end) { + i += 2 * gridDim.x * blockDim.x; + + float a = d_in[i]; + if (a > sdata[tid]) { + sdata[tid] = a; + indices[tid] = i; + } + + if (i + blockDim.x < end) { + float b = d_in[i + blockDim.x]; + if (b > sdata[tid]) { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + } + + __syncthreads(); + + for (int s = (blockDim.x >> 1); s > 32; s >>= 1) { + if (tid < s && tid + s < end) { + if (sdata[tid + s] > sdata[tid]) { + sdata[tid] = sdata[tid + s]; + indices[tid] = indices[tid + s]; + } + } + __syncthreads(); + } + + UNROLL_MAXARG_LOOP(32, end); + UNROLL_MAXARG_LOOP(16, end); + UNROLL_MAXARG_LOOP(8, end); + UNROLL_MAXARG_LOOP(4, end); + UNROLL_MAXARG_LOOP(2, end); + UNROLL_MAXARG_LOOP(1, end); + + if (tid == 0) { + d_out[blockIdx.x + batchIdx * gridDim.x] = sdata[0]; + d_ind[blockIdx.x + batchIdx * gridDim.x] = indices[0]; + } + __syncthreads(); + } +} + +__global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, int *batchFirstElements, float* outCosts, int* outIdxs, int *cummulatedBeamSizes, int NUM_BLOCKS) { + extern __shared__ float sdata[]; + __shared__ int indices[512]; + __shared__ float bestBinCost; + __shared__ int bestBinCostIdx; + + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int N = batchFirstElements[batchIdx + 1] - batchFirstElements[batchIdx]; + int num_bins = int(N / (2 * 512)) + int(N % (2 * 512) != 0); + if (num_bins > 500) { + num_bins = 500; + } + + for (int pos = cummulatedBeamSizes[batchIdx]; pos < cummulatedBeamSizes[batchIdx + 1]; ++pos) { + int i = tid; + + sdata[tid] = -3.40282e+38f; + + if (i < num_bins) { + sdata[tid] = binCosts[batchIdx * NUM_BLOCKS + i]; + indices[tid] = i; + } + + if (i + blockDim.x < num_bins) { + float a = binCosts[batchIdx * NUM_BLOCKS + i]; + float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x]; + if (a > b) { + sdata[tid] = a; + indices[tid] = i; + } else { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + + while (i + 2 * blockDim.x < num_bins) { + i += 2 * blockDim.x; + + float a = binCosts[batchIdx * NUM_BLOCKS + i]; + if (a > sdata[tid]) { + sdata[tid] = a; + indices[tid] = i; + } + + if (i + blockDim.x < num_bins) { + float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x]; + if (b > sdata[tid]) { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + } + + __syncthreads(); + + for (int s = (blockDim.x >> 1); s > 32; s >>= 1) { + if (tid < s && tid + s < num_bins) { + if (sdata[tid + s] > sdata[tid]) { + sdata[tid] = sdata[tid + s]; + indices[tid] = indices[tid + s]; + } + } + __syncthreads(); + } + + UNROLL_MAXARG_LOOP(32, num_bins); + UNROLL_MAXARG_LOOP(16, num_bins); + UNROLL_MAXARG_LOOP(8, num_bins); + UNROLL_MAXARG_LOOP(4, num_bins); + UNROLL_MAXARG_LOOP(2, num_bins); + UNROLL_MAXARG_LOOP(1, num_bins); + + if (tid == 0) { + bestBinCost = sdata[0]; + bestBinCostIdx = batchIdx * NUM_BLOCKS + indices[0]; + + probs[binIdxs[bestBinCostIdx]] = -3.40282e+38f; + + outIdxs[pos] = binIdxs[bestBinCostIdx]; + outCosts[pos] = bestBinCost; + } + + __syncthreads(); + + i = batchFirstElements[batchIdx] + (bestBinCostIdx - batchIdx * NUM_BLOCKS) * 
(blockDim.x * 2) + tid; + const int dist = num_bins * 2 * blockDim.x; + + sdata[tid] = -3.40282e+38f; + + if (i < batchFirstElements[batchIdx + 1]) { + sdata[tid] = probs[i]; + indices[tid] = i; + } + + if (i + blockDim.x < batchFirstElements[batchIdx + 1]) { + float a = probs[i]; + float b = probs[i+blockDim.x]; + if (a > b) { + sdata[tid] = a; + indices[tid] = i; + } else { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + + while (i + dist < batchFirstElements[batchIdx + 1]) { + i += dist; + + float a = probs[i]; + if (a > sdata[tid]) { + sdata[tid] = a; + indices[tid] = i; + } + + if (i + blockDim.x < batchFirstElements[batchIdx + 1]) { + float b = probs[i + blockDim.x]; + if (b > sdata[tid]) { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + } + + __syncthreads(); + + for (int s = (blockDim.x >> 1); s > 32; s >>= 1) { + if (tid < s && tid + s < batchFirstElements[batchIdx + 1]) { + if (sdata[tid + s] > sdata[tid]) { + sdata[tid] = sdata[tid + s]; + indices[tid] = indices[tid + s]; + } + } + __syncthreads(); + } + + UNROLL_MAXARG_LOOP(32, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(16, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(8, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(4, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(2, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(1, batchFirstElements[batchIdx + 1]); + + if (tid == 0) { + binCosts[bestBinCostIdx] = sdata[0]; + binIdxs[bestBinCostIdx] = indices[0]; + } + __syncthreads(); + } +} + +__global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid < n) { + int index = indeces[tid]; + d_out[tid] = d_in[index]; + } +} + +NthElement::NthElement(size_t maxBeamSize, size_t maxBatchSize, cudaStream_t stream) + : stream_(stream) , + NUM_BLOCKS(std::min(500, int(maxBeamSize * 85000 / (2 * BLOCK_SIZE)) + int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0))) +{ + //std::cerr << "NthElement::NthElement" << std::endl; + + HANDLE_ERROR( cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)) ); + + HANDLE_ERROR( cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)) ); + + HANDLE_ERROR( cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)) ); + HANDLE_ERROR( cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)) ); + + HANDLE_ERROR( cudaHostAlloc((void**) &h_res, maxBeamSize * maxBatchSize* sizeof(float), + cudaHostAllocDefault) ); + HANDLE_ERROR( cudaHostAlloc((void**) &h_res_idx, maxBeamSize * maxBatchSize * sizeof(int), + cudaHostAllocDefault) ); + + HANDLE_ERROR( cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)) ); + HANDLE_ERROR( cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)) ); + HANDLE_ERROR( cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)) ); +} + +NthElement::~NthElement() +{ + HANDLE_ERROR(cudaFree(d_ind)); + HANDLE_ERROR(cudaFree(d_out)); + HANDLE_ERROR(cudaFree(d_res_idx)); + HANDLE_ERROR(cudaFree(d_res)); + HANDLE_ERROR(cudaFreeHost(h_res)); + HANDLE_ERROR(cudaFreeHost(h_res_idx)); + HANDLE_ERROR(cudaFree(d_breakdown)); + HANDLE_ERROR(cudaFree(d_batchPosition)); + HANDLE_ERROR(cudaFree(d_cumBeamSizes)); +} + +void NthElement::getNBestList(float* probs, const std::vector& batchFirstElementIdxs, + const std::vector& cummulatedBeamSizes) +{ + HANDLE_ERROR( cudaMemcpyAsync(d_batchPosition, batchFirstElementIdxs.data(), batchFirstElementIdxs.size() * sizeof(int), + 
cudaMemcpyHostToDevice, stream_) ); + HANDLE_ERROR( cudaMemcpyAsync(d_cumBeamSizes, cummulatedBeamSizes.data(), cummulatedBeamSizes.size() * sizeof(int), + cudaMemcpyHostToDevice, stream_) ); + + const int numBatches = batchFirstElementIdxs.size() - 1; + + gMaxElement<<>> + (d_out, d_ind, probs, numBatches, d_batchPosition); + + gMaxElementUpdate<<>> + (d_out, d_ind, probs, d_batchPosition, d_res, d_res_idx, d_cumBeamSizes, NUM_BLOCKS); +} + +void NthElement::getNBestList(const std::vector& beamSizes, Tensor Probs, + std::vector& outCosts, std::vector& outKeys, + const bool isFirst) { + std::vector cummulatedBeamSizes(beamSizes.size() + 1, 0); + std::vector batchFirstElementIdxs(beamSizes.size() + 1, 0); + + const size_t vocabSize = Probs->shape()[1]; + for (size_t i = 0; i < beamSizes.size(); ++i) { + + cummulatedBeamSizes[i + 1] = cummulatedBeamSizes[i] + beamSizes[i]; + batchFirstElementIdxs[i + 1] += ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize; + } + + getNBestList(Probs->data(), batchFirstElementIdxs, cummulatedBeamSizes); + GetPairs(cummulatedBeamSizes.back(), outKeys, outCosts); + +} + +void NthElement::GetPairs(size_t number, + std::vector& outKeys, + std::vector& outValues) { + + HANDLE_ERROR( cudaMemcpyAsync(h_res, d_res, number * sizeof(float), + cudaMemcpyDeviceToHost, stream_) ); + HANDLE_ERROR( cudaMemcpyAsync(h_res_idx, d_res_idx, number * sizeof(int), + cudaMemcpyDeviceToHost, stream_) ); + cudaStreamSynchronize(stream_); + + for (size_t i = 0; i < number; ++i) { + outKeys.push_back(h_res_idx[i]); + outValues.push_back(h_res[i]); + } + + lastN = number; +} + +void NthElement::getValueByKey(std::vector& out, float* d_in) { + gGetValueByKey<<<1, lastN, 0, stream_>>> + (d_in, d_breakdown, h_res_idx, lastN); + + HANDLE_ERROR( cudaMemcpyAsync(out.data(), d_breakdown, lastN * sizeof(float), + cudaMemcpyDeviceToHost, stream_) ); + HANDLE_ERROR( cudaStreamSynchronize(stream_)); +} + +} + diff --git a/src/translator/nth_element.h b/src/translator/nth_element.h new file mode 100644 index 00000000..9ac25da5 --- /dev/null +++ b/src/translator/nth_element.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +#include +#include "tensors/tensor.h" + +namespace marian { + +class NthElement { + public: + NthElement() = delete; + NthElement(const NthElement ©) = delete; + NthElement(size_t maxBeamSize, size_t maxBatchSize, cudaStream_t stream); + virtual ~NthElement(); + + void getNBestList(float* probs, const std::vector& batchFirstElementIdxs, + const std::vector& cummulatedBeamSizes); + + void getNBestList(const std::vector& beamSizes, Tensor Probs, + std::vector& outCosts, std::vector& outKeys, + const bool isFirst=false); + + void GetPairs(size_t number, + std::vector& outKeys, + std::vector& outValues); + + void getValueByKey(std::vector& out, float* d_in); + + private: + const int BLOCK_SIZE = 512; + const int NUM_BLOCKS; + cudaStream_t stream_; + int *d_ind; + + float *d_out; + + int *d_res_idx; + float *d_res; + + int *h_res_idx; + float *h_res; + + float *d_breakdown; + int *d_batchPosition; + int *d_cumBeamSizes; + size_t lastN; +}; + +}
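
A note on the shard sizing in AsyncGraphGroup above: the shard size must be the ceiling of `totalSize / numDevices`, since plain integer division truncates and would leave the tail of the parameter vector uncovered (which is also why `ceil` needs a float operand there). The following is a standalone sketch of that arithmetic with hypothetical names, not code from the patch:

    // Integer ceil division spreads totalSize over numDevices shards;
    // the last shard absorbs any shortfall, mirroring min(shardSize_, totalSize).
    #include <algorithm>
    #include <cassert>
    #include <vector>

    std::vector<int> shardSizes(int totalSize, int numDevices) {
      int shardSize = (totalSize + numDevices - 1) / numDevices;  // ceil division
      std::vector<int> sizes;
      int remaining = totalSize;
      for(int d = 0; d < numDevices; ++d) {
        int size = std::min(shardSize, remaining);
        remaining -= size;
        sizes.push_back(size);
      }
      assert(remaining == 0);  // every parameter belongs to exactly one shard
      return sizes;
    }

For example, 10 parameters over 3 devices yields shards of 4, 4 and 2.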
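The point of `shardSync_` in `pushGradients` is that each shard carries its own mutex, so two workers only serialize when they touch the same parameter slice instead of contending on one global lock. A minimal CPU-only sketch of the pattern, with assumed names and plain SGD standing in for `shardOpt_` (the real code moves GPU tensors):

    #include <algorithm>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Shard {
      std::mutex sync;
      std::vector<float> params, grads;
    };

    void pushGradients(std::vector<Shard>& shards,
                       const std::vector<float>& newGrads) {
      std::vector<std::thread> threads;
      size_t pos = 0;
      for(auto& shard : shards) {
        size_t size = shard.grads.size();
        threads.emplace_back([&shard, &newGrads, pos, size]() {
          std::lock_guard<std::mutex> guard(shard.sync);  // per-shard lock
          std::copy(newGrads.begin() + pos, newGrads.begin() + pos + size,
                    shard.grads.begin());
          for(size_t i = 0; i < size; ++i)   // SGD stands in for the optimizer
            shard.params[i] -= 0.01f * shard.grads[i];
        });
        pos += size;
      }
      for(auto& t : threads)
        t.join();
    }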
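The async task lambda uses a `thread_local` graph/builder pair so that each pool worker claims a device slot exactly once, on its first task, and every later batch it processes stays on the same GPU. A condensed sketch of that binding trick (hypothetical names):

    #include <cstddef>
    #include <mutex>

    struct Worker { int id = -1; };  // stand-in for the graph+builder pair

    int nextId = 0;                  // guarded by initMutex, like `i` and sync_
    std::mutex initMutex;

    void runTask() {
      thread_local Worker worker;    // one instance per pool thread
      if(worker.id == -1) {
        std::lock_guard<std::mutex> lock(initMutex);
        worker.id = nextId++;        // claim the next free device slot once
      }
      // ... process the batch on device worker.id ...
    }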
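SyncGraphGroup implements the synchronous alternative: collect one batch per worker, sum the workers' gradients into worker 0, average, take a single optimizer step, then broadcast the parameters back. A CPU-vector sketch of that data flow under assumed names:

    #include <vector>

    void syncStep(std::vector<std::vector<float>>& grads,  // one buffer per worker
                  std::vector<float>& params, float lr) {
      std::vector<float>& master = grads[0];
      for(size_t w = 1; w < grads.size(); ++w)             // accumulateGradients
        for(size_t i = 0; i < master.size(); ++i)
          master[i] += grads[w][i];
      for(size_t i = 0; i < master.size(); ++i) {
        master[i] /= grads.size();        // like Element(_1 /= denom, grads)
        params[i] -= lr * master[i];      // SGD stands in for opt_->update
      }
      // distributeParameters would now copy `params` back to every worker
    }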
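The early-stopping logic is split between Reporter and Validator: a validation score that fails to improve on the best seen so far bumps a stall counter, and `keepGoing()` stops training once that counter reaches the configured patience ("early-stopping"). Condensed into one self-contained sketch:

    #include <cstddef>
    #include <limits>

    struct EarlyStopping {
      bool lowerIsBetter;
      float best;
      std::size_t stalled = 0;
      std::size_t patience;

      EarlyStopping(std::size_t patience, bool lowerIsBetter = true)
          : lowerIsBetter(lowerIsBetter),
            best(lowerIsBetter ? std::numeric_limits<float>::max()
                               : std::numeric_limits<float>::lowest()),
            patience(patience) {}

      // Returns true while training should continue.
      bool report(float value) {
        bool improved = lowerIsBetter ? value < best : value > best;
        if(improved) {
          best = value;
          stalled = 0;
        } else {
          ++stalled;
        }
        return stalled < patience;
      }
    };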
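The two validators accumulate the same summed cost (`topNode()->scalar() * batch->size()`) and differ only in the normalizer: mean cost per sentence for cross-entropy, versus the exponential of the mean cost per word for perplexity. As two hypothetical helper functions:

    #include <cmath>
    #include <cstddef>

    float crossEntropyScore(float costSum, std::size_t sentences) {
      return costSum / sentences;
    }

    float perplexityScore(float costSum, std::size_t words) {
      return expf(costSum / words);  // ppl = exp(total cost / total words)
    }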
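In nth_element.cu, `gMaxElement` is a grid-wide arg-max reduction that leaves one candidate per block ("bin"), and `gMaxElementUpdate` then loops beam-size times: pick the best bin, emit its (cost, index) pair, overwrite that probability with -FLT_MAX, and re-reduce only the bin it changed. A host-side reference for what this computes per batch entry, with assumed names:

    #include <cstddef>
    #include <limits>
    #include <utility>
    #include <vector>

    std::vector<std::pair<float, int>> topK(std::vector<float> probs,
                                            std::size_t k) {
      std::vector<std::pair<float, int>> best;
      for(std::size_t n = 0; n < k && n < probs.size(); ++n) {
        int argmax = 0;
        for(int i = 1; i < (int)probs.size(); ++i)
          if(probs[i] > probs[argmax])
            argmax = i;
        best.push_back({probs[argmax], argmax});
        probs[argmax] = std::numeric_limits<float>::lowest();  // mask, like the kernel
      }
      return best;
    }

The GPU version avoids this quadratic rescan by keeping the per-bin maxima and only re-reducing the single bin whose winner was masked out.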
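Finally, the `isFirst` branch in `NthElement::getNBestList` reflects how the flattened score matrix is partitioned: each live hypothesis contributes `vocabSize` scores, and on the very first beam-search step every sentence has exactly one hypothesis, hence the `(i + 1)` offset instead of the cumulative beam size. A sketch of the offset computation (hypothetical helper, same logic as the loop in the patch):

    #include <cstddef>
    #include <vector>

    std::vector<int> batchOffsets(const std::vector<size_t>& beamSizes,
                                  size_t vocabSize, bool isFirst) {
      std::vector<int> cumBeam(beamSizes.size() + 1, 0);
      std::vector<int> offsets(beamSizes.size() + 1, 0);
      for(size_t i = 0; i < beamSizes.size(); ++i) {
        cumBeam[i + 1] = cumBeam[i] + beamSizes[i];
        // first step: one hypothesis per sentence; later: cumulative beam
        offsets[i + 1] = (isFirst ? (i + 1) : cumBeam[i + 1]) * vocabSize;
      }
      return offsets;
    }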