Scaling dropout across RNN time steps and layers

Marcin Junczys-Dowmunt 2017-02-26 20:24:17 +00:00
commit fcd99c49f7
65 changed files with 4025 additions and 1694 deletions
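
In essence, the change turns dropout masks into ordinary constant nodes in the expression graph, so a mask can be sampled once and then reused at every RNN time step and in every layer (in the spirit of variational dropout, Gal & Ghahramani 2016) instead of being resampled at each application. A standalone C++ sketch of the idea (illustrative only, not Marian code; the inverted-dropout scaling by the keep probability is an assumption):

#include <random>
#include <vector>

int main() {
  const int dim = 4, steps = 3;
  const float keep = 0.9f;                 // keep probability = 1 - dropout rate
  std::mt19937 gen(1234);
  std::bernoulli_distribution bern(keep);

  std::vector<float> mask(dim);
  for(auto& m : mask)
    m = bern(gen) / keep;                  // sample the mask once per sequence

  std::vector<float> state(dim, 1.0f);
  for(int t = 0; t < steps; ++t)           // ...and reuse it at every time step
    for(int i = 0; i < dim; ++i)
      state[i] *= mask[i];
}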

.gitignore (3 lines changed)

@ -40,6 +40,3 @@ build
# Examples
examples/*/*.gz
examples/mnist/*ubyte
.cproject
.project

CMakeLists.txt

@ -5,7 +5,7 @@ project(marian CXX)
find_package(CUDA "8.0" REQUIRED)
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -Wno-unused-result -Wno-deprecated -fPIC -Wno-deprecated-gpu-targets")
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35; -DCUDNN)
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35;)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
include_directories(${amunn_SOURCE_DIR})

README.md

@ -1,7 +1,8 @@
Marian
======
[![Join the chat at https://gitter.im/MarianNMT/Lobby](https://badges.gitter.im/MarianNMT/Lobby.svg)](https://gitter.im/MarianNMT/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Join the chat at https://gitter.im/amunmt/marian](https://badges.gitter.im/amunmt/marian.svg)](https://gitter.im/amunmt/marian?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=Marian)](http://vali.inf.ed.ac.uk/jenkins/job/Marian/)
Google group for commit messages: https://groups.google.com/forum/#!forum/mariannmt
@ -17,30 +18,12 @@ Installation
Requirements:
* g++ with c++11
* CUDA and CuDNN
* CUDA
* Boost (>= 1.56)
Exporting some paths for CuDNN may be required (put them, for example, in your `.bashrc` file):
export PATH=$PATH:$HOME/.local/bin:/usr/local/cuda/bin
export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64
export CPATH=$CPATH:/usr/local/cudnn-5/include
Compilation with `cmake > 3.5`:
mkdir build
cd build
cmake ..
make -j
To compile API documentation using Doxygen, first cd to the build directory, and then:
make doc
To test, first compile, then:
cd examples/mnist
make
cd ../../build
./mnist_benchmark

marian/.cproject (new file, 163 lines)

@ -0,0 +1,163 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
<storageModule moduleId="org.eclipse.cdt.core.settings">
<cconfiguration id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693" moduleId="org.eclipse.cdt.core.settings" name="Debug">
<externalSettings/>
<extensions>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe,org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.debug" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693" name="Debug" parent="com.nvidia.cuda.ide.seven_five.configuration.debug">
<folderInfo id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.tools.toolchain.seven_five.exe.debug.1735809242" name="CUDA Toolkit 8.0" superClass="com.nvidia.cuda.tools.toolchain.seven_five.exe.debug">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.1814841241" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/marian}/Debug" id="com.nvidia.cuda.ide.builder.466223137" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="CUDA Toolkit 8.0 Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.1979453423" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.deviceDebug.188182034" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug" value="true" valueType="boolean"/>
<option id="nvcc.compiler.option.level.1731110905" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level" value="true" valueType="boolean"/>
<option defaultValue="nvcc.compiler.optimization.level.none" id="nvcc.compiler.optimization.level.1954677201" name="Optimization Level" superClass="nvcc.compiler.optimization.level" valueType="enumerated"/>
<option id="nvcc.compiler.pic.533579278" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
<option id="nvcc.compiler.include.paths.1654919056" name="Include paths (-I)" superClass="nvcc.compiler.include.paths" valueType="includePath">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/}/boost/include&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/}/src/3rd_party&quot;"/>
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/}/src&quot;"/>
</option>
<option id="nvcc.compiler.cpp11option.1114260643" name="Enable C++11 support (-std=c++11)" superClass="nvcc.compiler.cpp11option" value="true" valueType="boolean"/>
<inputType id="nvcc.compiler.input.cu.111060846" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.945692641" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.749588226" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.635344589" name="NVCC Linker" superClass="nvcc.linker.base">
<option id="nvcc.linker.option.libs.1878015233" name="Libraries (-l)" superClass="nvcc.linker.option.libs" valueType="libs">
<listOptionValue builtIn="false" value="boost_chrono"/>
<listOptionValue builtIn="false" value="boost_iostreams"/>
<listOptionValue builtIn="false" value="boost_filesystem"/>
<listOptionValue builtIn="false" value="boost_program_options"/>
<listOptionValue builtIn="false" value="boost_system"/>
<listOptionValue builtIn="false" value="boost_timer"/>
<listOptionValue builtIn="false" value="cudnn"/>
<listOptionValue builtIn="false" value="cuda"/>
<listOptionValue builtIn="false" value="cublas"/>
<listOptionValue builtIn="false" value="z"/>
</option>
<option id="nvcc.linker.option.paths.1326041662" name="Library search path (-L)" superClass="nvcc.linker.option.paths" valueType="libPaths">
<listOptionValue builtIn="false" value="&quot;${workspace_loc:/}/boost/lib64&quot;"/>
<listOptionValue builtIn="false" value="/usr/local/cuda/lib"/>
<listOptionValue builtIn="false" value="/usr/lib"/>
</option>
<inputType id="nvcc.linker.input.1742167733" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.1766259627" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1563873432" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.191093879" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
<fileInfo id="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.2014034551" name="npz_converter.h" rcbsApplicability="disable" resourcePath="src/data/npz_converter.h" toolsToInvoke=""/>
<sourceEntries>
<entry excluding="src/test|src/3rd_party/spdlog/details/format.cc|src/data/npz_converter.h|src/data/npz_converter.cpp|src/xor.cu|src/tensor_test.cu|src/rnn_test.cu|src/test_nodes.cu|src/nematus_test.cu|src/tensors/bac|src/softmax_benchmark.cu|src/mnist_benchmark.cu|src/validate_encoder_decoder.cu|src/test.cu|src/validate_mnist_batch.cu|src/train_mnist.cu|src/validate_mnist.cu|src/npz_converter.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
</sourceEntries>
</configuration>
</storageModule>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<sass major="2" minor="0"/>
<ptx major="2" minor="0"/>
</executable>
<editor-arch major="2" minor="0"/>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
<cconfiguration id="com.nvidia.cuda.ide.seven_five.configuration.release.77237983">
<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="com.nvidia.cuda.ide.seven_five.configuration.release.77237983" moduleId="org.eclipse.cdt.core.settings" name="Release">
<externalSettings/>
<extensions>
<extension id="com.nvidia.cuda.ide.elf" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.cubin" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="com.nvidia.cuda.ide.macho" point="org.eclipse.cdt.core.BinaryParser"/>
<extension id="org.eclipse.cdt.core.GASErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GmakeErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GLDErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.VCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="nvcc.errorParser" point="org.eclipse.cdt.core.ErrorParser"/>
<extension id="org.eclipse.cdt.core.GCCErrorParser" point="org.eclipse.cdt.core.ErrorParser"/>
</extensions>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<configuration artifactName="${ProjName}" buildArtefactType="org.eclipse.cdt.build.core.buildArtefactType.exe" buildProperties="org.eclipse.cdt.build.core.buildType=org.eclipse.cdt.build.core.buildType.release,org.eclipse.cdt.build.core.buildArtefactType=org.eclipse.cdt.build.core.buildArtefactType.exe" cleanCommand="rm -rf" description="" id="com.nvidia.cuda.ide.seven_five.configuration.release.77237983" name="Release" parent="com.nvidia.cuda.ide.seven_five.configuration.release">
<folderInfo id="com.nvidia.cuda.ide.seven_five.configuration.release.77237983." name="/" resourcePath="">
<toolChain id="com.nvidia.cuda.ide.toolchain.seven_five.exe.release.537573056" name="CUDA Toolkit 8.0" superClass="com.nvidia.cuda.ide.toolchain.seven_five.exe.release">
<targetPlatform archList="all" binaryParser="com.nvidia.cuda.ide.elf;com.nvidia.cuda.ide.macho;com.nvidia.cuda.ide.cubin" id="com.nvidia.cuda.ide.targetPlatform.1603968154" isAbstract="false" name="Debug Platform" osList="linux,macosx" superClass="com.nvidia.cuda.ide.targetPlatform"/>
<builder buildPath="${workspace_loc:/marian}/Release" id="com.nvidia.cuda.ide.builder.1512078117" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="CUDA Toolkit 8.0 Builder" superClass="com.nvidia.cuda.ide.builder"/>
<tool id="nvcc.compiler.base.6717312" name="NVCC Compiler" superClass="nvcc.compiler.base">
<option id="nvcc.compiler.deviceDebug.98602926" name="Generate device debug information (-G)" superClass="nvcc.compiler.deviceDebug"/>
<option id="nvcc.compiler.option.level.902202019" name="Generate host debug information (-g)" superClass="nvcc.compiler.option.level"/>
<option defaultValue="nvcc.compiler.optimization.level.most" id="nvcc.compiler.optimization.level.929501471" name="Optimization Level" superClass="nvcc.compiler.optimization.level" valueType="enumerated"/>
<option id="nvcc.compiler.pic.1429189596" name="Position Independent Code (-fPIC)" superClass="nvcc.compiler.pic"/>
<inputType id="nvcc.compiler.input.cu.168631664" superClass="nvcc.compiler.input.cu"/>
<inputType id="nvcc.compiler.input.cpp.2051297104" superClass="nvcc.compiler.input.cpp"/>
<inputType id="nvcc.compiler.input.c.1492088925" superClass="nvcc.compiler.input.c"/>
</tool>
<tool id="nvcc.linker.base.1475934167" name="NVCC Linker" superClass="nvcc.linker.base">
<inputType id="nvcc.linker.input.1739308440" superClass="nvcc.linker.input">
<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
<additionalInput kind="additionalinput" paths="$(LIBS)"/>
</inputType>
</tool>
<tool id="nvcc.archiver.base.1045271474" name="NVCC Archiver" superClass="nvcc.archiver.base"/>
<tool id="com.nvidia.host.assembler.1853273636" name="Host Assembler" superClass="com.nvidia.host.assembler">
<inputType id="cdt.managedbuild.tool.gnu.assembler.input.832456357" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
</tool>
</toolChain>
</folderInfo>
</configuration>
</storageModule>
<storageModule moduleId="com.nvidia.cuda.ide.build.project.ICudaProjectConfiguration">
<executable devicelink="false">
<sass major="2" minor="0"/>
<ptx major="2" minor="0"/>
</executable>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.externalSettings"/>
</cconfiguration>
</storageModule>
<storageModule moduleId="cdtBuildSystem" version="4.0.0">
<project id="marian.com.nvidia.cuda.ide.seven_five.exe.198591110" name="Executable" projectType="com.nvidia.cuda.ide.seven_five.exe"/>
</storageModule>
<storageModule moduleId="scannerConfiguration">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId=""/>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693;com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.;nvcc.compiler.base.1979453423;nvcc.compiler.input.cu.111060846">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693;com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.;nvcc.compiler.base.1979453423;nvcc.compiler.input.c.749588226">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
<scannerConfigBuildInfo instanceId="com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693;com.nvidia.cuda.ide.seven_five.configuration.debug.1479727693.;nvcc.compiler.base.1979453423;nvcc.compiler.input.cpp.945692641">
<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="com.nvidia.cuda.ide.build.NVCCPerProjectProfile"/>
</scannerConfigBuildInfo>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
<storageModule moduleId="refreshScope" versionNumber="2">
<configuration configurationName="Release">
<resource resourceType="PROJECT" workspacePath="/marian"/>
</configuration>
<configuration configurationName="Debug">
<resource resourceType="PROJECT" workspacePath="/marian"/>
</configuration>
</storageModule>
<storageModule moduleId="org.eclipse.cdt.internal.ui.text.commentOwnerProjectMappings"/>
<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
</cproject>

marian/.project (new file, 34 lines)

@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<name>marian</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.genmakebuilder</name>
<triggers>clean,full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder</name>
<triggers>full,incremental,</triggers>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<natures>
<nature>org.eclipse.cdt.core.cnature</nature>
<nature>org.eclipse.cdt.core.ccnature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.managedBuildNature</nature>
<nature>org.eclipse.cdt.managedbuilder.core.ScannerConfigNature</nature>
</natures>
<linkedResources>
<link>
<name>src</name>
<type>2</type>
<locationURI>PARENT-1-PROJECT_LOC/src</locationURI>
</link>
</linkedResources>
</projectDescription>

src/3rd_party/threadpool.h

@ -45,6 +45,7 @@ class ThreadPool {
template<class F, class... Args>
auto enqueue(F&& f, Args&&... args)
-> std::future<typename std::result_of<F(Args...)>::type>;
~ThreadPool();
size_t getNumTasks() const {
@ -128,6 +129,3 @@ inline ThreadPool::~ThreadPool() {
worker.join();
}
}

src/CMakeLists.txt

@ -13,8 +13,13 @@ cuda_add_library(marian_lib
graph/node_operators.cu
tensors/tensor.cu
kernels/tensor_operators.cu
kernels/dropout.cu
layers/param_initializers.cpp
common/utils.cpp
common/logging.cpp
common/history.cpp
training/config.cpp
translator/nth_element.cu
data/vocab.cpp
data/corpus.cpp
$<TARGET_OBJECTS:libyaml-cpp>
@ -27,30 +32,39 @@ cuda_add_executable(
test/tensor_test.cu
)
cuda_add_executable(
marian_translate
test/marian_translate.cu
)
cuda_add_executable(
marian_test
test/marian_test.cu
)
cuda_add_executable(
bn_test
test/bn_test.cu
)
cuda_add_executable(
marian
command/config.cpp
command/marian.cu
)
cuda_add_executable(
dropout_test
test/dropout_test.cu
kernels/dropout_cudnn.cu
)
target_link_libraries(marian marian_lib)
target_link_libraries(tensor_test marian_lib)
target_link_libraries(marian_test marian_lib)
target_link_libraries(dropout_test marian_lib)
target_link_libraries(marian_translate marian_lib)
target_link_libraries(bn_test marian_lib)
foreach(exec tensor_test marian_test marian dropout_test)
target_link_libraries(${exec} ${EXT_LIBS} cudnn)
foreach(exec dropout_test tensor_test marian_test marian_translate marian bn_test)
target_link_libraries(${exec} ${EXT_LIBS} curand)
cuda_add_cublas_to_target(${exec})
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

src/command/marian.cu

@ -1,86 +1,13 @@
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <string>
#include <cstdio>
#include <boost/timer/timer.hpp>
#include <boost/chrono.hpp>
#include <boost/program_options.hpp>
#include <thread>
#include <chrono>
#include <mutex>
#include "marian.h"
#include "optimizers/optimizers.h"
#include "optimizers/clippers.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "models/nematus.h"
#include "common/logging.h"
#include "command/config.h"
#include "parallel/graph_group.h"
namespace marian {
void TrainingLoop(Ptr<Config> options,
Ptr<data::BatchGenerator<data::Corpus>> batchGenerator) {
auto reporter = New<Reporter>(options);
Ptr<GraphGroup> graphGroup = New<AsynchronousGraphGroup<Nematus>>(options);
graphGroup->setReporter(reporter);
size_t epochs = 1;
size_t batches = 0;
while((options->get<size_t>("after-epochs") == 0
|| epochs <= options->get<size_t>("after-epochs")) &&
(options->get<size_t>("after-batches") == 0
|| batches < options->get<size_t>("after-batches"))) {
batchGenerator->prepare(!options->get<bool>("no-shuffle"));
boost::timer::cpu_timer timer;
while(*batchGenerator) {
auto batch = batchGenerator->next();
graphGroup->update(batch);
}
epochs++;
LOG(info) << "Starting epoch " << epochs << " after "
<< reporter->samples << " samples";
}
LOG(info) << "Training finshed";
graphGroup->save();
}
}
#include "models/gnmt.h"
int main(int argc, char** argv) {
using namespace marian;
using namespace data;
using namespace keywords;
std::shared_ptr<spdlog::logger> info;
info = spdlog::stderr_logger_mt("info");
info->set_pattern("[%Y-%m-%d %T] %v");
auto options = New<Config>(argc, argv);
std::cerr << *options << std::endl;
auto dimVocabs = options->get<std::vector<int>>("dim-vocabs");
int dimEmb = options->get<int>("dim-emb");
int dimRnn = options->get<int>("dim-rnn");
int dimBatch = options->get<int>("mini-batch");
int dimMaxiBatch = options->get<int>("maxi-batch");
auto options = New<Config>(argc, argv);
auto trainSets = options->get<std::vector<std::string>>("trainsets");
auto vocabs = options->get<std::vector<std::string>>("vocabs");
size_t maxSentenceLength = options->get<size_t>("max-length");
auto corpus = New<Corpus>(trainSets, vocabs, dimVocabs, maxSentenceLength);
auto bg = New<BatchGenerator<Corpus>>(corpus, dimBatch, dimMaxiBatch);
TrainingLoop(options, bg);
Train<AsyncGraphGroup<GNMT>>(options);
return 0;
}

src/common/definitions.h

@ -30,6 +30,7 @@
#include <thrust/host_vector.h>
#include "shape.h"
#include "common/logging.h"
namespace marian {
@ -93,7 +94,6 @@ namespace marian {
// An enumeration of directions
enum struct dir { forward, backward, bidirect };
/**
* @brief Defines a set of keywords.
*
@ -101,27 +101,32 @@ namespace marian {
* will result in the creation of an instance of the Keyword class.
*/
namespace keywords {
KEY(axis, int)
KEY(shape, Shape)
KEY(value, float)
KEY(prefix, std::string)
KEY(final, bool)
KEY(output_last, bool)
KEY(activation, act)
KEY(direction, dir)
KEY(mask, Expr)
KEY(init, std::function<void(Tensor)>)
KEY(axis, int);
KEY(shape, Shape);
KEY(value, float);
KEY(prefix, std::string);
KEY(final, bool);
KEY(output_last, bool);
KEY(activation, act);
KEY(direction, dir);
KEY(mask, Expr);
KEY(dropout_prob, float);
KEY(init, std::function<void(Tensor)>);
KEY(eta, float)
KEY(beta1, float)
KEY(beta2, float)
KEY(eps, float)
KEY(optimizer, Ptr<OptimizerBase>)
KEY(clip, Ptr<ClipperBase>)
KEY(batch_size, int)
KEY(max_epochs, int)
KEY(valid, Ptr<RunBase>)
KEY(eta, float);
KEY(beta1, float);
KEY(beta2, float);
KEY(eps, float);
KEY(optimizer, Ptr<OptimizerBase>);
KEY(clip, Ptr<ClipperBase>);
KEY(batch_size, int);
KEY(normalize, bool);
KEY(skip, bool);
KEY(skip_first, bool);
KEY(coverage, Expr);
KEY(max_epochs, int);
KEY(valid, Ptr<RunBase>);
}
}

src/common/history.cpp (new file, 10 lines)

@ -0,0 +1,10 @@
#include "history.h"
namespace marian {
History::History(size_t lineNo)
: normalize_(true),
lineNo_(lineNo)
{}
}

src/common/history.h (new executable file, 79 lines)

@ -0,0 +1,79 @@
#pragma once
#include <queue>
#include "hypothesis.h"
namespace marian {
class History {
private:
struct HypothesisCoord {
bool operator<(const HypothesisCoord& hc) const {
return cost < hc.cost;
}
size_t i;
size_t j;
float cost;
};
public:
History(size_t lineNo);
void Add(const Beam& beam, bool last = false) {
if (beam.back()->GetPrevHyp() != nullptr) {
for (size_t j = 0; j < beam.size(); ++j)
if(beam[j]->GetWord() == 0 || last) {
float cost = normalize_ ? beam[j]->GetCost() / history_.size() : beam[j]->GetCost();
topHyps_.push({ history_.size(), j, cost });
}
}
history_.push_back(beam);
}
size_t size() const {
return history_.size();
}
NBestList NBest(size_t n) const {
NBestList nbest;
auto topHypsCopy = topHyps_;
while (nbest.size() < n && !topHypsCopy.empty()) {
auto bestHypCoord = topHypsCopy.top();
topHypsCopy.pop();
size_t start = bestHypCoord.i;
size_t j = bestHypCoord.j;
Words targetWords;
Ptr<Hypothesis> bestHyp = history_[start][j];
while(bestHyp->GetPrevHyp() != nullptr) {
targetWords.push_back(bestHyp->GetWord());
bestHyp = bestHyp->GetPrevHyp();
}
std::reverse(targetWords.begin(), targetWords.end());
nbest.emplace_back(targetWords, history_[bestHypCoord.i][bestHypCoord.j]);
}
return nbest;
}
Result Top() const {
return NBest(1)[0];
}
size_t GetLineNum() const
{ return lineNo_; }
private:
std::vector<Beam> history_;
std::priority_queue<HypothesisCoord> topHyps_;
bool normalize_;
size_t lineNo_;
};
typedef std::vector<History> Histories;
}

src/common/hypothesis.h (new file, 58 lines)

@ -0,0 +1,58 @@
#pragma once
#include <memory>
#include "common/definitions.h"
namespace marian {
class Hypothesis {
public:
Hypothesis()
: prevHyp_(nullptr),
prevIndex_(0),
word_(0),
cost_(0.0)
{}
Hypothesis(const Ptr<Hypothesis> prevHyp, size_t word, size_t prevIndex, float cost)
: prevHyp_(prevHyp),
prevIndex_(prevIndex),
word_(word),
cost_(cost)
{}
const Ptr<Hypothesis> GetPrevHyp() const {
return prevHyp_;
}
size_t GetWord() const {
return word_;
}
size_t GetPrevStateIndex() const {
return prevIndex_;
}
float GetCost() const {
return cost_;
}
std::vector<float>& GetCostBreakdown() {
return costBreakdown_;
}
private:
const Ptr<Hypothesis> prevHyp_;
const size_t prevIndex_;
const size_t word_;
const float cost_;
std::vector<float> costBreakdown_;
};
typedef std::vector<Ptr<Hypothesis>> Beam;
typedef std::vector<Beam> Beams;
typedef std::vector<size_t> Words;
typedef std::pair<Words, Ptr<Hypothesis>> Result;
typedef std::vector<Result> NBestList;
}

src/common/keywords.h

@ -219,7 +219,7 @@ namespace keywords {
*/
#define KEY(name, value_type) \
typedef const Keyword<COMPILE_TIME_CRC32_STR(#name),value_type> name ## _k; \
name ## _k name;
name ## _k name
}
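
With the semicolon moved out of the macro, every KEY(...) use now supplies its own (hence the semicolons added throughout the keywords block above); for example, KEY(axis, int); expands to:

typedef const Keyword<COMPILE_TIME_CRC32_STR("axis"), int> axis_k;
axis_k axis;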

src/common/logging.cpp (new file, 42 lines)

@ -0,0 +1,42 @@
#include "logging.h"
#include "training/config.h"
std::shared_ptr<spdlog::logger> stderrLogger(const std::string& name,
const std::string& pattern,
const std::vector<std::string>& files) {
std::vector<spdlog::sink_ptr> sinks;
auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance();
sinks.push_back(stderr_sink);
for(auto&& file : files) {
auto file_sink = std::make_shared<spdlog::sinks::simple_file_sink_st>(file, true);
sinks.push_back(file_sink);
}
auto logger = std::make_shared<spdlog::logger>(name, begin(sinks), end(sinks));
spdlog::register_logger(logger);
logger->set_pattern(pattern);
return logger;
}
void createLoggers(const marian::Config& options) {
std::vector<std::string> generalLogs;
std::vector<std::string> validLogs;
if(options.has("log")) {
generalLogs.push_back(options.get<std::string>("log"));
validLogs.push_back(options.get<std::string>("log"));
}
if(options.has("valid-log")) {
validLogs.push_back(options.get<std::string>("valid-log"));
}
Logger info{stderrLogger("info", "[%Y-%m-%d %T] %v", generalLogs)};
Logger config{stderrLogger("config", "[%Y-%m-%d %T] [config] %v", generalLogs)};
Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v", generalLogs)};
Logger data{stderrLogger("data", "[%Y-%m-%d %T] [data] %v", generalLogs)};
Logger valid{stderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs)};
}

src/common/logging.h

@ -3,3 +3,15 @@
#include "spdlog/spdlog.h"
#define LOG(logger) spdlog::get(#logger)->info()
typedef std::shared_ptr<spdlog::logger> Logger;
Logger stderrLogger(const std::string&, const std::string&,
const std::vector<std::string>& = {});
namespace marian {
class Config;
}
void createLoggers(const marian::Config& options);
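
A usage sketch tying these two files together (Config construction as in marian.cu above; the comment shows what the macro expands to):

auto options = New<Config>(argc, argv);
createLoggers(*options);
LOG(info) << "Initialized";   // i.e. spdlog::get("info")->info() << "Initialized"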

src/data/batch_generator.h

@ -5,7 +5,8 @@
#include <boost/timer/timer.hpp>
#include "dataset.h"
#include "data/dataset.h"
#include "training/config.h"
namespace marian {
@ -21,22 +22,24 @@ class BatchGenerator {
private:
Ptr<DataSet> data_;
Ptr<Config> options_;
typename DataSet::iterator current_;
size_t batchSize_;
size_t maxiBatchSize_;
std::deque<BatchPtr> bufferedBatches_;
BatchPtr currentBatch_;
void fillBatches() {
void fillBatches(bool shuffle=true) {
auto cmp = [](const sample& a, const sample& b) {
return a[0].size() < b[0].size();
};
std::priority_queue<sample, samples, decltype(cmp)> maxiBatch(cmp);
while(current_ != data_->end() && maxiBatch.size() < maxiBatchSize_) {
int maxSize = options_->get<int>("mini-batch") * options_->get<int>("maxi-batch");
while(current_ != data_->end() && maxiBatch.size() < maxSize) {
maxiBatch.push(*current_);
current_++;
}
@ -45,7 +48,7 @@ class BatchGenerator {
while(!maxiBatch.empty()) {
batchVector.push_back(maxiBatch.top());
maxiBatch.pop();
if(batchVector.size() == batchSize_) {
if(batchVector.size() == options_->get<int>("mini-batch")) {
bufferedBatches_.push_back(data_->toBatch(batchVector));
batchVector.clear();
}
@ -53,17 +56,15 @@ class BatchGenerator {
if(!batchVector.empty())
bufferedBatches_.push_back(data_->toBatch(batchVector));
std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end());
if(shuffle)
std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end());
}
public:
BatchGenerator(Ptr<DataSet> data,
size_t batchSize=80,
size_t maxiBatchNum=20)
Ptr<Config> options)
: data_(data),
batchSize_(batchSize),
maxiBatchSize_(batchSize * maxiBatchNum)
{ }
options_(options) { }
operator bool() const {
return !bufferedBatches_.empty();
@ -84,8 +85,10 @@ class BatchGenerator {
void prepare(bool shuffle=true) {
if(shuffle)
data_->shuffle();
else
data_->reset();
current_ = data_->begin();
fillBatches();
fillBatches(shuffle);
}
};
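
For reference, the call pattern (matching the training loop removed from marian.cu above, adapted to the new options-based constructor) is roughly:

auto corpus = New<Corpus>(options);
auto batchGenerator = New<BatchGenerator<Corpus>>(corpus, options);
batchGenerator->prepare(!options->get<bool>("no-shuffle"));
while(*batchGenerator) {
  auto batch = batchGenerator->next();
  // ... one training step on batch ...
}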

src/data/corpus.cpp

@ -1,5 +1,6 @@
#include <random>
#include "corpus.h"
#include "data/corpus.h"
namespace marian {
namespace data {
@ -33,20 +34,53 @@ const SentenceTuple& CorpusIterator::dereference() const {
return tup_;
}
Corpus::Corpus(const std::vector<std::string>& textPaths,
const std::vector<std::string>& vocabPaths,
const std::vector<int>& maxVocabs,
size_t maxLength)
: textPaths_(textPaths),
maxLength_(maxLength)
{
UTIL_THROW_IF2(textPaths.size() != vocabPaths.size(),
Corpus::Corpus(Ptr<Config> options)
: options_(options),
textPaths_(options_->get<std::vector<std::string>>("train-sets")),
maxLength_(options_->get<size_t>("max-length")) {
std::vector<std::string> vocabPaths;
if(options_->has("vocabs"))
vocabPaths = options_->get<std::vector<std::string>>("vocabs");
UTIL_THROW_IF2(!vocabPaths.empty() && textPaths_.size() != vocabPaths.size(),
"Number of corpus files and vocab files does not agree");
std::vector<int> maxVocabs =
options_->get<std::vector<int>>("dim-vocabs");
std::vector<Vocab> vocabs;
for(int i = 0; i < vocabPaths.size(); ++i) {
vocabs_.emplace_back(vocabPaths[i], maxVocabs[i]);
if(vocabPaths.empty()) {
for(int i = 0; i < textPaths_.size(); ++i) {
Ptr<Vocab> vocab = New<Vocab>();
vocab->loadOrCreate(textPaths_[i], maxVocabs[i]);
vocabs_.emplace_back(vocab);
}
}
else {
for(int i = 0; i < vocabPaths.size(); ++i) {
Ptr<Vocab> vocab = New<Vocab>();
vocab->load(vocabPaths[i], maxVocabs[i]);
vocabs_.emplace_back(vocab);
}
}
for(auto path : textPaths_) {
files_.emplace_back(new InputFileStream(path));
}
}
Corpus::Corpus(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: options_(options),
textPaths_(paths),
vocabs_(vocabs),
maxLength_(options_->get<size_t>("max-length")) {
UTIL_THROW_IF2(textPaths_.size() != vocabs_.size(),
"Number of corpus files and vocab files does not agree");
for(auto path : textPaths_) {
files_.emplace_back(new InputFileStream(path));
@ -61,7 +95,7 @@ SentenceTuple Corpus::next() {
for(int i = 0; i < files_.size(); ++i) {
std::string line;
if(std::getline((std::istream&)*files_[i], line)) {
Words words = vocabs_[i](line);
Words words = (*vocabs_[i])(line);
if(words.empty())
words.push_back(0);
tup.push_back(words);
@ -82,8 +116,15 @@ void Corpus::shuffle() {
shuffleFiles(textPaths_);
}
void Corpus::reset() {
files_.clear();
for(auto& path : textPaths_) {
files_.emplace_back(new InputFileStream(path));
}
}
void Corpus::shuffleFiles(const std::vector<std::string>& paths) {
std::cerr << "Shuffling files" << std::endl;
LOG(data) << "Shuffling files";
std::vector<std::vector<std::string>> corpus;
files_.clear();
@ -129,7 +170,7 @@ void Corpus::shuffleFiles(const std::vector<std::string>& paths) {
files_.emplace_back(new InputFileStream(path));
}
std::cerr << "Done" << std::endl;
LOG(data) << "Done";
}
}

src/data/corpus.h

@ -4,6 +4,7 @@
#include <fstream>
#include <boost/iterator/iterator_facade.hpp>
#include "training/config.h"
#include "common/definitions.h"
#include "data/vocab.h"
#include "common/file_stream.h"
@ -38,11 +39,11 @@ class CorpusBatch {
}
std::cerr << std::endl;
std::cerr << "\t m: ";
for(auto w : b.second) {
std::cerr << w << " ";
}
std::cerr << std::endl;
//std::cerr << "\t m: ";
//for(auto w : b.second) {
//std::cerr << w << " ";
//}
//std::cerr << std::endl;
}
}
}
@ -88,9 +89,11 @@ class CorpusIterator
class Corpus {
private:
Ptr<Config> options_;
std::vector<std::string> textPaths_;
std::vector<UPtr<InputFileStream>> files_;
std::vector<Vocab> vocabs_;
std::vector<Ptr<Vocab>> vocabs_;
size_t maxLength_;
void shuffleFiles(const std::vector<std::string>& paths);
@ -102,14 +105,17 @@ class Corpus {
typedef CorpusIterator iterator;
typedef SentenceTuple sample;
Corpus(const std::vector<std::string>& textPaths,
const std::vector<std::string>& vocabPaths,
const std::vector<int>& maxVocabs,
size_t maxLength = 50);
Corpus(Ptr<Config> options);
Corpus(std::vector<std::string> paths,
std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options);
sample next();
void shuffle();
void reset();
iterator begin() {
return iterator(*this);
@ -118,6 +124,10 @@ class Corpus {
iterator end() {
return iterator();
}
std::vector<Ptr<Vocab>>& getVocabs() {
return vocabs_;
}
batch_ptr toBatch(const std::vector<sample>& batchVector) {
int batchSize = batchVector.size();

src/data/mnist.h (deleted file)

@ -1,188 +0,0 @@
#pragma once
// This file is part of the Marian toolkit.
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include <cassert>
#include <algorithm>
#include "dataset.h"
#include "batch_generator.h"
namespace marian {
namespace data {
/** @brief DataBase capable of reading <a href="http://yann.lecun.com/exdb/mnist/">MNIST</a> data. */
class MNIST : public DataBase {
private:
const int IMAGE_MAGIC_NUMBER;
const int LABEL_MAGIC_NUMBER;
Examples examples_;
public:
typedef Batch batch_type;
typedef std::shared_ptr<batch_type> batch_ptr;
/**
* @brief Constructs a DataBase using <a href="http://yann.lecun.com/exdb/mnist/">MNIST</a> data.
*
* @param featuresPath Path to file containing <a href="http://yann.lecun.com/exdb/mnist/">MNIST</a> feature values
* @param labelsPath Path to file containing <a href="http://yann.lecun.com/exdb/mnist/">MNIST</a> labels
*/
MNIST(const std::string& featuresPath,
const std::string& labelsPath)
: IMAGE_MAGIC_NUMBER(2051),
LABEL_MAGIC_NUMBER(2049)
{
auto features = ReadImages(featuresPath);
auto labels = ReadLabels(labelsPath);
UTIL_THROW_IF2(features.size() != labels.size(),
"Features do not match labels");
for(int i = 0; i < features.size(); ++i)
examples_.emplace_back(new Example({ features[i], labels[i] }));
}
ExampleIterator begin() const {
return ExampleIterator(examples_.begin());
}
ExampleIterator end() const {
return ExampleIterator(examples_.end());
}
void shuffle() {
std::random_shuffle(examples_.begin(), examples_.end());
}
batch_ptr toBatch(const Examples& batchVector) {
int batchSize = batchVector.size();
std::vector<int> maxDims;
for(auto& ex : batchVector) {
if(maxDims.size() < ex->size())
maxDims.resize(ex->size(), 0);
for(int i = 0; i < ex->size(); ++i) {
if((*ex)[i]->size() > maxDims[i])
maxDims[i] = (*ex)[i]->size();
}
}
batch_ptr batch(new Batch());
std::vector<Input::iterator> iterators;
for(auto& m : maxDims) {
batch->push_back(Shape({batchSize, m}));
iterators.push_back(batch->inputs().back().begin());
}
for(auto& ex : batchVector) {
for(int i = 0; i < ex->size(); ++i) {
DataPtr d = (*ex)[i];
d->resize(maxDims[i], 0.0f);
iterators[i] = std::copy(d->begin(), d->end(), iterators[i]);
}
}
return batch;
}
private:
typedef unsigned char uchar;
int reverseInt(int i) {
unsigned char c1, c2, c3, c4;
c1 = i & 255, c2 = (i >> 8) & 255, c3 = (i >> 16) & 255, c4 = (i >> 24) & 255;
return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4;
}
std::vector<DataPtr> ReadImages(const std::string& full_path) {
std::ifstream file(full_path);
UTIL_THROW_IF2(!file.is_open(),
"Cannot open file `" + full_path + "`!");
int magic_number = 0;
file.read((char *)&magic_number, sizeof(magic_number));
magic_number = reverseInt(magic_number);
UTIL_THROW_IF2(magic_number != IMAGE_MAGIC_NUMBER,
"Invalid MNIST image file!");
int number_of_images;
int n_rows = 0;
int n_cols = 0;
file.read((char *)&number_of_images, sizeof(number_of_images));
number_of_images = reverseInt(number_of_images);
file.read((char *)&n_rows, sizeof(n_rows));
n_rows = reverseInt(n_rows);
file.read((char *)&n_cols, sizeof(n_cols));
n_cols = reverseInt(n_cols);
int imgSize = n_rows * n_cols;
std::vector<DataPtr> _dataset(number_of_images);
for(int i = 0; i < number_of_images; ++i) {
_dataset[i].reset(new Data(imgSize, 0));
for (int j = 0; j < imgSize; j++) {
unsigned char pixel = 0;
file.read((char*)&pixel, sizeof(pixel));
(*_dataset[i])[j] = pixel / 255.0f;
}
}
return _dataset;
}
std::vector<DataPtr> ReadLabels(const std::string& full_path) {
std::ifstream file(full_path);
if (! file.is_open())
throw std::runtime_error("Cannot open file `" + full_path + "`!");
int magic_number = 0;
file.read((char *)&magic_number, sizeof(magic_number));
magic_number = reverseInt(magic_number);
if (magic_number != LABEL_MAGIC_NUMBER)
throw std::runtime_error("Invalid MNIST label file!");
int number_of_labels;
file.read((char *)&number_of_labels, sizeof(number_of_labels));
number_of_labels = reverseInt(number_of_labels);
std::vector<DataPtr> _dataset(number_of_labels);
for (int i = 0; i < number_of_labels; i++) {
_dataset[i].reset(new Data(1, 0.0f));
unsigned char label;
file.read((char*)&label, 1);
(*_dataset[i])[0] = label;
}
return _dataset;
}
};
} // namespace data
}


@ -1,142 +0,0 @@
#pragma once
#include <iostream>
#include <iomanip>
#include <boost/timer/timer.hpp>
#include "common/keywords.h"
#include "common/definitions.h"
#include "graph/expression_graph.h"
#include "optimizers/optimizers.h"
#include "data/batch_generator.h"
namespace marian {
class RunBase {
public:
virtual void run() = 0;
};
typedef std::shared_ptr<RunBase> RunBasePtr;
template <class DataSet>
class Trainer : public RunBase,
public keywords::Keywords {
private:
ExpressionGraphPtr graph_;
std::shared_ptr<DataSet> dataset_;
public:
template <typename ...Args>
Trainer(ExpressionGraphPtr graph,
std::shared_ptr<DataSet> dataset,
Args... args)
: Keywords(args...),
graph_(graph),
dataset_(dataset)
{}
void run() {
using namespace data;
using namespace keywords;
boost::timer::cpu_timer trainTimer;
auto opt = Get(optimizer, Optimizer<Adam>());
auto batchSize = Get(batch_size, 200);
auto maxEpochs = Get(max_epochs, 50);
BatchGenerator<DataSet> bg(dataset_, batchSize);
auto validator = Get(valid, RunBasePtr());
size_t update = 0;
for(int epoch = 1; epoch <= maxEpochs; ++epoch) {
boost::timer::cpu_timer epochTimer;
bg.prepare();
float cost = 0;
float totalExamples = 0;
while(bg) {
auto batch = bg.next();
opt->update(graph_);
cost += graph_->get("cost")->val()->scalar() * batch->dim();
totalExamples += batch->dim();
update++;
}
cost = cost / totalExamples;
std::cerr << "Epoch: " << std::setw(std::to_string(maxEpochs).size())
<< epoch << "/" << maxEpochs << " - Update: " << update
<< " - Cost: " << std::fixed << std::setprecision(4) << cost
<< " - Time: " << epochTimer.format(2, "%ws")
<< " - " << trainTimer.format(0, "%ws") << std::endl;
if(validator)
validator->run();
}
}
};
template <class DataSet>
class Validator : public RunBase,
public keywords::Keywords {
private:
ExpressionGraphPtr graph_;
std::shared_ptr<DataSet> dataset_;
float correct(const std::vector<float> pred, const std::vector<float> labels) {
size_t num = labels.size();
size_t scores = pred.size() / num;
size_t acc = 0;
for (size_t i = 0; i < num; ++i) {
size_t proposed = 0;
for(size_t j = 0; j < scores; ++j) {
if(pred[i * scores + j] > pred[i * scores + proposed])
proposed = j;
}
acc += (proposed == labels[i]);
}
return (float)acc;
}
public:
template <typename ...Args>
Validator(ExpressionGraphPtr graph,
std::shared_ptr<DataSet> dataset,
Args... args)
: Keywords(args...),
graph_(graph),
dataset_(dataset)
{}
void run() {
using namespace data;
using namespace keywords;
auto batchSize = Get(batch_size, 200);
BatchGenerator<DataSet> bg(dataset_, batchSize);
size_t update = 0;
bg.prepare(false);
float total = 0;
float cor = 0;
while(bg) {
auto batch = bg.next();
graph_->forward();
std::vector<float> scores;
graph_->get("scores")->val()->get(scores);
cor += correct(scores, batch->inputs()[1].data());
total += batch->dim();
update++;
}
std::cerr << "Accuracy: " << cor / total << std::endl;
}
};
template <class Process, typename ...Args>
RunBasePtr Run(Args&& ...args) {
return RunBasePtr(new Process(args...));
}
}


@ -7,6 +7,8 @@
typedef size_t Word;
typedef std::vector<Word> Words;
const Word EOS = 0;
const Word UNK = 1;
const Word EOS_ID = 0;
const Word UNK_ID = 1;
const std::string EOS_STR = "</s>";
const std::string UNK_STR = "<unk>";

src/data/vocab.cpp

@ -1,26 +1,15 @@
#include <sstream>
#include <algorithm>
#include "data/vocab.h"
#include "common/utils.h"
#include "common/file_stream.h"
#include "3rd_party/exception.h"
#include "3rd_party/yaml-cpp/yaml.h"
#include "common/logging.h"
Vocab::Vocab(const std::string& path, int max) {
YAML::Node vocab = YAML::Load(InputFileStream(path));
for(auto&& pair : vocab) {
auto str = pair.first.as<std::string>();
auto id = pair.second.as<Word>();
if (id < (Word)max) {
str2id_[str] = id;
if(id >= id2str_.size())
id2str_.resize(id + 1);
id2str_[id] = str;
}
}
UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << path);
id2str_[0] = "</s>";
Vocab::Vocab() {
}
size_t Vocab::operator[](const std::string& word) const {
@ -28,7 +17,7 @@ size_t Vocab::operator[](const std::string& word) const {
if(it != str2id_.end())
return it->second;
else
return 1;
return UNK_ID;
}
Words Vocab::operator()(const std::vector<std::string>& lineTokens, bool addEOS) const {
@ -36,7 +25,7 @@ Words Vocab::operator()(const std::vector<std::string>& lineTokens, bool addEOS)
std::transform(lineTokens.begin(), lineTokens.end(), words.begin(),
[&](const std::string& w) { return (*this)[w]; });
if(addEOS)
words.push_back(EOS);
words.push_back(EOS_ID);
return words;
}
@ -49,7 +38,7 @@ Words Vocab::operator()(const std::string& line, bool addEOS) const {
std::vector<std::string> Vocab::operator()(const Words& sentence, bool ignoreEOS) const {
std::vector<std::string> decoded;
for(size_t i = 0; i < sentence.size(); ++i) {
if(sentence[i] != EOS || !ignoreEOS) {
if(sentence[i] != EOS_ID || !ignoreEOS) {
decoded.push_back((*this)[sentence[i]]);
}
}
@ -65,3 +54,91 @@ const std::string& Vocab::operator[](size_t id) const {
size_t Vocab::size() const {
return id2str_.size();
}
void Vocab::loadOrCreate(const std::string& trainPath, int max)
{
if(boost::filesystem::exists(trainPath + ".json")) {
load(trainPath + ".json", max);
return;
}
if(boost::filesystem::exists(trainPath + ".yml")) {
load(trainPath + ".yml", max);
return;
}
create(trainPath + ".yml", max, trainPath);
load(trainPath + ".yml", max);
}
void Vocab::load(const std::string& vocabPath, int max)
{
LOG(data) << "Loading vocabulary from " << vocabPath << " (max: " << max << ")";
YAML::Node vocab = YAML::Load(InputFileStream(vocabPath));
for(auto&& pair : vocab) {
auto str = pair.first.as<std::string>();
auto id = pair.second.as<Word>();
if (id < (Word)max) {
str2id_[str] = id;
if(id >= id2str_.size())
id2str_.resize(id + 1);
id2str_[id] = str;
}
}
UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << vocabPath);
id2str_[EOS_ID] = EOS_STR;
id2str_[UNK_ID] = UNK_STR;
}
class Vocab::VocabFreqOrderer
{
public:
bool operator()(const Vocab::Str2Id::value_type* a, const Vocab::Str2Id::value_type* b) const {
return a->second < b->second;
}
};
void Vocab::create(const std::string& vocabPath, int max, const std::string& trainPath)
{
LOG(data) << "Creating vocabulary " << vocabPath
<< " from " << trainPath << " (max: " << max << ")";
UTIL_THROW_IF2(boost::filesystem::exists(vocabPath),
"Vocab file " << vocabPath << " exist. Not overwriting");
InputFileStream trainStrm(trainPath);
Str2Id vocab;
std::string line;
while (getline((std::istream&)trainStrm, line)) {
std::vector<std::string> toks;
Split(line, toks);
for (const std::string &tok: toks) {
Str2Id::iterator iter = vocab.find(tok);
if (iter == vocab.end())
vocab[tok] = 1;
else
iter->second++;
}
}
// put into vector & sort
std::vector<const Str2Id::value_type*> vocabVec;
vocabVec.reserve(max);
for (const Str2Id::value_type &p: vocab)
vocabVec.push_back(&p);
std::sort(vocabVec.rbegin(), vocabVec.rend(), VocabFreqOrderer());
YAML::Node vocabYaml;
vocabYaml[EOS_STR] = EOS_ID;
vocabYaml[UNK_STR] = UNK_ID;
for(size_t i = 0; i < vocabVec.size(); ++i) {
const Str2Id::value_type *p = vocabVec[i];
vocabYaml[p->first] = i + 2;
}
OutputFileStream vocabStrm(vocabPath);
(std::ostream&)vocabStrm << vocabYaml;
}
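
The written file maps tokens to ids: 0 and 1 are reserved for </s> and <unk>, and the remaining tokens are numbered by descending corpus frequency (the rbegin/rend sort reverses the ascending-count orderer). An illustrative result:

</s>: 0
<unk>: 1
the: 2
of: 3
to: 4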

src/data/vocab.h

@ -8,7 +8,7 @@
class Vocab {
public:
Vocab(const std::string& path, int max = 50000);
Vocab();
size_t operator[](const std::string& word) const;
@ -22,7 +22,16 @@ class Vocab {
size_t size() const;
void loadOrCreate(const std::string& textPath, int max);
void load(const std::string& vocabPath, int max);
void create(const std::string& vocabPath, int max, const std::string& trainPath);
private:
std::map<std::string, size_t> str2id_;
std::vector<std::string> id2str_;
typedef std::map<std::string, size_t> Str2Id;
Str2Id str2id_;
typedef std::vector<std::string> Id2Str;
Id2Str id2str_;
class VocabFreqOrderer;
};

src/graph/chainable.h

@ -23,6 +23,7 @@
#include <vector>
#include <memory>
#include <boost/functional/hash.hpp>
#include "exception.h"
@ -106,6 +107,8 @@ struct Chainable {
virtual void debug(const std::string& message) = 0;
virtual bool marked_for_debug() = 0;
virtual const std::string& debug_message() = 0;
virtual size_t hash() = 0;
};
/**

src/graph/expression_graph.h

@ -1,26 +1,5 @@
#pragma once
// This file is part of the Marian toolkit.
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <map>
#include <unordered_set>
#include <fstream>
@ -32,7 +11,9 @@
#include "data/batch_generator.h"
#include "tensors/tensor_allocator.h"
#include "layers/param_initializers.h"
#include "kernels/dropout.h"
#include "3rd_party/threadpool.h"
#include "3rd_party/cnpy/cnpy.h"
namespace marian {
@ -66,9 +47,10 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
Ptr<TensorAllocator> tensors_;
cublasHandle_t cublasHandle_;
curandGenerator_t curandGenerator_;
size_t device_{0};
size_t stale_{0};
std::unordered_map<size_t, Expr> hashMap_;
protected:
/** @brief Constructs a new expression graph
@ -84,17 +66,26 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
public:
~ExpressionGraph() {
clear();
}
void setDevice(size_t device = 0) {
device_ = device;
params_.init(device);
tensors_ = New<TensorAllocator>(device);
cublasHandle_ = create_handle(device);
curandGenerator_ = createCurandGenerator(device, 1234);
}
cublasHandle_t getCublasHandle() {
return cublasHandle_;
}
curandGenerator_t getCurandGenerator() {
return curandGenerator_;
}
size_t getDevice() {
return device_;
}
@ -132,26 +123,34 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
* @param batchSize XXX Marcin, could you provide a description of this param?
*/
void forward() {
size_t forward() {
params_.allocateForward();
for(auto&& tape : tapes_) {
for(auto&& v : tape) {
v->allocate();
v->init();
v->forward();
return forward(0);
}
// @TODO: should be done in node
for(auto&& child : v->children()) {
v->decreaseEdges(1);
child->decreaseEdges(1);
}
size_t forward(size_t pos) {
// @TODO: check if allocation works properly
if(v->marked_for_debug()) {
std::cerr << "Debug: " << v->debug_message() << std::endl;
std::cerr << v->val()->debug() << std::endl;
}
auto it = nodes_.begin() + pos;
while(it != nodes_.end()) {
auto v = *it;
v->allocate();
v->init();
v->forward();
// @TODO: should be done in node
for(auto&& child : v->children()) {
v->decreaseEdges(1);
child->decreaseEdges(1);
}
if(v->marked_for_debug()) {
std::cerr << "Debug: " << v->debug_message() << std::endl;
std::cerr << v->val()->debug() << std::endl;
}
it++;
}
return std::distance(nodes_.begin(), it);
}
/**
@ -172,7 +171,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
params_.allocateBackward();
params_.set_zero_adjoint();
for(auto&& v : topNodes_)
v->init_dependent();
@ -202,7 +201,7 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
it++;
}
}
/**
* @brief Returns a string representing this expression graph in <code>graphviz</code> notation.
*
@ -300,8 +299,6 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
*
* This method does not attach the new constant node to any existing expression graph.
*
* @param args XXX Marcin, what are args here?
*
* @return a newly constructed constant node
*/
template <typename ...Args>
@ -341,6 +338,17 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
args...);
}
template <typename ...Args>
inline Expr dropout(float prob, Shape shape) {
auto dropoutInit = [prob, this](Tensor t) {
Dropout(t, prob, getCurandGenerator());
};
return Expression<ConstantNode>(shared_from_this(),
keywords::init=dropoutInit,
keywords::shape=shape);
}
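
Since the mask is an ordinary constant node, keeping the returned Expr around and multiplying it into several expressions reuses a single random draw. A hedged sketch (dimension and variable names invented):

auto mask = graph->dropout(0.2f, {dimBatch, dimRnn});
auto h1 = hidden1 * mask;   // the same mask...
auto h2 = hidden2 * mask;   // ...applied twice
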
/*********************************************************/
/**
@ -387,10 +395,18 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
named_.emplace(name, e);
}
void add(Expr node) {
Expr add(Expr node) {
size_t group = 0;
size_t hash = node->hash();
auto it = hashMap_.find(hash);
if(it != hashMap_.end())
return it->second;
hashMap_[hash] = node;
node->setId(count_++);
for(auto& child: node->children()) {
group = std::max(group, tapeMap_[child] + 1);
child->increaseEdges(2);
@ -402,6 +418,8 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
tapes_[group].push_back(node);
nodes_.push_back(node);
topNodes_.insert(node);
return node;
}
void remove_top_node(Expr node) {
@ -428,18 +446,72 @@ class ExpressionGraph : public std::enable_shared_from_this<ExpressionGraph> {
inputs_.clear();
topNodes_.clear();
tensors_->clear();
hashMap_.clear();
}
Expr topNode() {
return nodes_.back();
}
void load(const std::string& name) {
using namespace keywords;
LOG(info) << "Loading model from " << name;
auto numpy = cnpy::npz_load(name);
for(auto it : numpy) {
auto name = it.first;
Shape shape;
if(it.second.shape.size() == 2) {
shape.set(0, it.second.shape[0]);
shape.set(1, it.second.shape[1]);
}
else if(it.second.shape.size() == 1) {
shape.set(0, 1);
shape.set(1, it.second.shape[0]);
}
param(name, shape,
init=inits::from_numpy(it.second));
}
}
void save(const std::string& name) {
LOG(info) << "Saving model to " << name;
unsigned shape[2];
std::string mode = "w";
cudaSetDevice(getDevice());
for(auto p : params().getMap()) {
std::vector<float> v;
p.second->val() >> v;
unsigned dim;
if(p.second->shape()[0] == 1) {
shape[0] = p.second->shape()[1];
dim = 1;
}
else {
shape[0] = p.second->shape()[0];
shape[1] = p.second->shape()[1];
dim = 2;
}
std::string pName = p.first;
cnpy::npz_save(name, pName, v.data(), shape, dim, mode);
mode = "a";
}
}
};
template <class T, typename ...Args>
Expr Expression(Args&& ... args) {
// @TODO check hash, if exists do not add and return
// cached node to minimize calculations
auto e = Expr(new T(std::forward<Args>(args)...));
e->graph()->add(e);
return e;
return e->graph()->add(e);
}
}
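
Note the changed return in Expression(): together with hashMap_ in add(), structurally identical nodes are now deduplicated, so building the same subexpression twice yields a single node. A hedged illustration (names invented):

auto e1 = dot(x, W);
auto e2 = dot(x, W);   // same type and children, same hash: add() returns e1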

src/graph/expression_operators.cu

@ -118,11 +118,6 @@ Expr tanh(const std::vector<Expr>& nodes) {
return Expression<TanhNodeOp>(nodes);
}
//Expr tanh(Expr a, Expr b, Expr c) {
// std::vector<Expr> nodes = {a, b, c};
// return Expression<TanhPlus3NodeOp>(nodes);
//}
Expr logit(const std::vector<Expr>&) {
UTIL_THROW2("Not implemented");
}
@ -131,5 +126,30 @@ Expr relu(const std::vector<Expr>&) {
UTIL_THROW2("Not implemented");
}
Expr sqrt(Expr a, float eps) {
return Expression<SqrtNodeOp>(a, eps);
}
Expr square(Expr a) {
return Expression<SquareNodeOp>(a);
}
Expr layer_norm(Expr x, Expr gamma, Expr beta) {
std::vector<Expr> nodes = {x, gamma};
if(beta)
nodes.push_back(beta);
return Expression<LayerNormalizationOp>(nodes);
}
//Expr batch_norm(Expr x, Expr gamma, Expr beta) {
// auto mju = mean(x, keywords::axis=0);
// auto xmmju = x - mju;
// auto std = sqrt(mean(square(xmmju), keywords::axis=0), 1e-9);
//
// if(beta)
// return gamma * (xmmju / std) + beta;
// else
// return gamma * (xmmju / std);
//}
}

src/graph/expression_operators.h

@ -143,4 +143,25 @@ Expr weighted_average(Expr in, Expr weights, Args ...args) {
Expr step(Expr a, size_t step);
Expr sqrt(Expr a, float eps = 0.f);
Expr square(Expr a);
Expr layer_norm(Expr x, Expr gamma, Expr beta = nullptr);
//Expr batch_norm(Expr x, Expr gamma, Expr beta = nullptr);
template <typename ...Args>
Expr dropout(Expr x, Args ...args) {
auto mask = Get(keywords::mask, nullptr, args...);
float dropout_prob = Get(keywords::dropout_prob, 0.0f, args...);
UTIL_THROW_IF2(!mask && !dropout_prob,
"Neither mask nor dropout prob given");
if(!mask) {
auto graph = x->graph();
mask = graph->dropout(dropout_prob, x->shape());
}
return x * mask;
}
}
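
This helper is where the commit title lands: sample the mask once, then pass it via keywords::mask so the same dropout pattern applies at every time step. A hedged sketch (the cell function and dimension names are invented):

auto mask = input->graph()->dropout(0.1f, {dimBatch, dimRnn});
Expr state = start;
for(int t = 0; t < steps; ++t) {
  state = cell(state, inputs[t]);                // hypothetical RNN step
  state = dropout(state, keywords::mask=mask);   // one mask across all steps
}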

src/graph/node.h

@ -186,6 +186,7 @@ class Node : public Chainable<Tensor>,
};
struct NaryNodeOp : public Node {
size_t hash_{0};
std::vector<Expr> children_;
template <typename ...Args>
@ -205,6 +206,17 @@ struct NaryNodeOp : public Node {
return children_;
}
virtual size_t hash() {
if(!hash_) {
std::size_t seed = boost::hash<std::string>()(name());
boost::hash_combine(seed, type());
for(auto child : children())
boost::hash_combine(seed, child->hash());
hash_ = seed;
}
return hash_;
}
void remove_children_from_top_nodes();
};

src/graph/node_operators.h

@ -82,6 +82,11 @@ struct ConstantNode : public Node {
return "white";
}
virtual size_t hash() {
// @TODO: think of something better for constant nodes
return boost::hash<size_t>()((size_t)this);
}
private:
std::function<void(Tensor)> init_;
bool initialized_;
@ -117,6 +122,10 @@ struct ParamNode : public Node {
return "orangered";
}
virtual size_t hash() {
return boost::hash<size_t>()((size_t)this);
}
private:
std::function<void(Tensor&)> init_;
bool initialized_;

src/graph/node_operators_binary.h

@ -20,13 +20,8 @@ struct DotNodeOp : public NaryNodeOp {
auto shapeA = a->shape();
auto shapeB = b->shape();
Shape outShape;
if((shapeA[2] > 1 || shapeA[3] > 1) && shapeB[2] == 1 && shapeB[3] == 1)
outShape = {shapeA[0], shapeB[1], shapeA[2], shapeA[3]};
else {
outShape = shapeA;
outShape.set(1, shapeB[1]);
}
Shape outShape = shapeA;
outShape.set(1, shapeB[1]);
UTIL_THROW_IF2(shapeA[1] != shapeB[0],
"matrix product requires dimensions to match");
return outShape;
@ -338,6 +333,12 @@ struct ConcatenateNodeOp : public NaryNodeOp {
Deconcatenate(deconcatenees, adj_, ax_);
}
virtual size_t hash() {
size_t seed = NaryNodeOp::hash();
boost::hash_combine(seed, ax_);
return seed;
}
const std::string type() {
return "concat";
}
@ -437,5 +438,33 @@ struct AffineNodeOp : public NaryNodeOp {
}
};
struct LayerNormalizationOp : public NaryNodeOp {
LayerNormalizationOp(const std::vector<Expr>& nodes)
: NaryNodeOp(nodes) {}
NodeOps forwardOps() {
return {
NodeOp(
LayerNormalization(val_,
children_[0]->val(),
children_[1]->val(),
(children_.size() == 3) ? children_[2]->val() : nullptr))
};
}
NodeOps backwardOps() {
return {
NodeOp(LayerNormalizationGrad(children_[0]->grad(), children_[1]->grad(), (children_.size() == 3) ? children_[2]->grad() : nullptr,
adj_, val_, children_[0]->val(), children_[1]->val(),
(children_.size() == 3) ? children_[2]->val() : nullptr))
};
}
const std::string type() {
return "layer_normalization";
}
};
}

View File

@ -232,6 +232,16 @@ struct SoftmaxNodeOp : public NaryNodeOp {
};
}
virtual size_t hash() {
if(!hash_) {
hash_ = NaryNodeOp::hash();
if(mask_)
boost::hash_combine(hash_, mask_->hash());
}
return hash_;
}
NodeOps backwardOps() {
// For each row, the Jacobian times vector is given by:
// J * dy = p .* (dy - avg*1)
@ -281,9 +291,12 @@ struct LogSoftmaxNodeOp : public UnaryNodeOp {
};
struct SumNodeOp : public UnaryNodeOp {
int ax_;
template <typename ...Args>
SumNodeOp(Expr a, Args ...args)
: UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { }
: UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...),
ax_(keywords::Get(keywords::axis, -1, args...)) { }
NodeOps forwardOps() {
return { NodeOp(Reduce(_1, val_, children_[0]->val())) };
@ -317,19 +330,31 @@ struct SumNodeOp : public UnaryNodeOp {
return "orange";
}
virtual size_t hash() {
if(!hash_) {
hash_ = NaryNodeOp::hash();
boost::hash_combine(hash_, ax_);
}
return hash_;
}
};
struct MeanNodeOp : public UnaryNodeOp {
int ax_;
template <typename ...Args>
MeanNodeOp(Expr a, Args ...args)
: UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { }
: UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...),
ax_(keywords::Get(keywords::axis, -1, args...)) { }
NodeOps forwardOps() {
int left = children_[0]->shape().elements() / val_->shape().elements();
float scale = 1.f / left;
return {
NodeOp(Reduce(_1 * scale, val_, children_[0]->val()))
NodeOp(Reduce(_1, val_, children_[0]->val(), scale))
};
}
@ -338,7 +363,7 @@ struct MeanNodeOp : public UnaryNodeOp {
float scale = 1.f / left;
return {
NodeOp(Add(_1 * scale, children_[0]->grad(), adj_))
NodeOp(Add(_1, children_[0]->grad(), adj_, scale))
};
}
@ -365,6 +390,15 @@ struct MeanNodeOp : public UnaryNodeOp {
const std::string color() {
return "orange";
}
virtual size_t hash() {
if(!hash_) {
hash_ = NaryNodeOp::hash();
boost::hash_combine(hash_, ax_);
}
return hash_;
}
};
@ -423,6 +457,78 @@ struct ExpNodeOp : public UnaryNodeOp {
};
struct SqrtNodeOp : public UnaryNodeOp {
float epsilon_;
template <typename ...Args>
SqrtNodeOp(Expr a, float epsilon, Args ...args)
: UnaryNodeOp(a, args...),
epsilon_(epsilon) { }
NodeOps forwardOps() {
return {
NodeOp(Element(_1 = Sqrt(_2 + epsilon_),
val_,
children_[0]->val()))
};
}
NodeOps backwardOps() {
return {
NodeOp(Add(0.5f * (1.f / _1) * _2,
children_[0]->grad(),
val_,
adj_))
};
}
const std::string type() {
return "sqrt";
}
virtual size_t hash() {
if(!hash_) {
size_t seed = NaryNodeOp::hash();
boost::hash_combine(seed, epsilon_);
hash_ = seed;
}
return hash_;
}
};
struct SquareNodeOp : public UnaryNodeOp {
template <typename ...Args>
SquareNodeOp(Args ...args)
: UnaryNodeOp(args...) { }
NodeOps forwardOps() {
return {
NodeOp(Element(_1 = _2 * _2,
val_,
children_[0]->val()))
};
}
NodeOps backwardOps() {
return {
NodeOp(Add(2.f * _1 * _2,
children_[0]->grad(),
children_[0]->val(),
adj_))
};
}
const std::string type() {
return "square";
}
};
struct NegNodeOp : public UnaryNodeOp {
template <typename ...Args>
NegNodeOp(Args ...args)
@ -489,6 +595,17 @@ struct RowsNodeOp : public UnaryNodeOp {
return "orange";
}
virtual size_t hash() {
if(!hash_) {
size_t seed = NaryNodeOp::hash();
for(auto i : indeces_)
boost::hash_combine(seed, i);
hash_ = seed;
}
return hash_;
}
std::vector<size_t> indeces_;
};
@ -567,6 +684,17 @@ struct ReshapeNodeOp : public UnaryNodeOp {
const std::string color() {
return "grey";
}
virtual size_t hash() {
if(!hash_) {
size_t seed = NaryNodeOp::hash();
for(auto s : shape())
boost::hash_combine(seed, s);
hash_ = seed;
}
return hash_;
}
};
struct TimestepNodeOp : public UnaryNodeOp {
@ -619,6 +747,15 @@ struct TimestepNodeOp : public UnaryNodeOp {
const std::string color() {
return "grey";
}
virtual size_t hash() {
if(!hash_) {
hash_ = NaryNodeOp::hash();
boost::hash_combine(hash_, step_);
}
return hash_;
}
};
}

54
src/kernels/dropout.cu Normal file
View File

@ -0,0 +1,54 @@
#include <stdio.h>
#include <stdlib.h>
#include "kernels/dropout.h"
#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(1);}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(1);}} while(0)
namespace marian {
curandGenerator_t createCurandGenerator(size_t device,
size_t seed) {
cudaSetDevice(device);
curandGenerator_t generator;
CURAND_CALL(curandCreateGenerator(&generator,
CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed));
//cudaStream_t stream = 0;
//CURAND_CALL(curandSetStream(generator, stream));
//CURAND_CALL(curandDestroyGenerator(generator));
return generator;
}
__global__
void gScale(float* data, int n, float p) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
while (index < n) {
data[index] = (data[index] < p) / p;
index += gridDim.x * blockDim.x;
}
}
void Dropout(Tensor tensor, float p,
curandGenerator_t gen) {
int n = tensor->size();
CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n));
int numThreads = std::min(n, 512);
int numBlocks = n / numThreads + (n % numThreads != 0);
gScale<<<numBlocks, numThreads>>>(tensor->data(), n, 1.f - p);
}
}
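The launch configuration in Dropout is the usual ceiling division, and gScale's grid-stride loop lets any (blocks, threads) pair cover all n elements. Worked through for a hypothetical n:

// n = 1000: numThreads = min(1000, 512) = 512
// numBlocks  = 1000 / 512 + (1000 % 512 != 0) = 1 + 1 = 2
// The grid spans 2 * 512 = 1024 threads >= 1000 elements;
// the tail threads fail the `index < n` test and write nothing.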

15
src/kernels/dropout.h Normal file
View File

@ -0,0 +1,15 @@
#pragma once

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include "tensors/tensor.h"
namespace marian {
curandGenerator_t createCurandGenerator(size_t device, size_t seed=1234);
void Dropout(Tensor tensor, float p,
curandGenerator_t gen);
}

View File

@ -1,70 +0,0 @@
#include "dropout_cudnn.h"
#include "tensors/tensor.h"
namespace marian {
static cudnnHandle_t create_handle_dnn() {
cudnnHandle_t cudnnHandle;
cudnnCreate(&cudnnHandle);
return cudnnHandle;
}
cudnnHandle_t cudnnHandle = create_handle_dnn();
void CudnnDropoutPrepare(Tensor in, float p,
cudnnDropoutDescriptor_t* dropDesc,
void** space, size_t* spaceSize,
void** states, size_t seed) {
size_t statesSize;
cudnnDropoutGetStatesSize(cudnnHandle, &statesSize);
cudnnDropoutGetReserveSpaceSize(in->cudnn(), spaceSize);
cudaMalloc((void**)states, statesSize);
cudaMalloc((void**)space, *spaceSize);
cudnnCreateDropoutDescriptor(dropDesc);
cudnnSetDropoutDescriptor(*dropDesc,
cudnnHandle,
p,
(void*)*states,
statesSize,
seed);
}
void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,
void* space, void* states) {
cudnnDestroyDropoutDescriptor(dropDesc);
cudaFree(space);
cudaFree(states);
}
void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
void* space, size_t spaceSize,
Tensor out, Tensor in) {
cudnnDropoutForward(cudnnHandle,
dropoutDesc,
in->cudnn(),
in->data(),
out->cudnn(),
out->data(),
space,
spaceSize);
}
/* void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc, */
/* void* space, size_t spaceSize, */
/* Tensor out, Tensor in) { */
/* auto inGpu = static_cast<TensorGPU*>(in.get()); */
/* auto outGpu = static_cast<TensorGPU*>(out.get()); */
/* cudnnDropoutBackward(cudnnHandle, */
/* dropoutDesc, */
/* inGpu->cudnn(), */
/* inGpu->data(), */
/* outGpu->cudnn(), */
/* outGpu->data(), */
/* space, */
/* spaceSize); */
/* } */
}

View File

@ -1,24 +0,0 @@
#pragma once
#include <cudnn.h>
#include "tensors/tensor.h"
namespace marian {
void CudnnDropoutPrepare(Tensor in, float p,
cudnnDropoutDescriptor_t* dropDesc,
void** space, size_t* spaceSize,
void** states, size_t seed);
void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc,
void* space, void* states);
void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc,
void* space, size_t spaceSize,
Tensor out, Tensor in);
void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc,
void* space, size_t spaceSize,
Tensor out, Tensor in);
}

View File

@ -25,7 +25,7 @@
#include "3rd_party/reduce_all.h"
namespace marian {
cublasHandle_t create_handle(size_t device) {
cudaSetDevice(device);
@ -116,7 +116,7 @@ __global__ void gSoftmax(float* out,
const Shape outShape,
const float* in,
const float* mask) {
int rows = outShape[0];
int rows = outShape[0] * outShape[2] * outShape[3];
int cols = outShape[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
@ -129,7 +129,7 @@ __global__ void gSoftmax(float* out,
float* _max = _share + blockDim.x;
_max[threadIdx.x] = sp[threadIdx.x]; // mask
for(int tid = 1; tid < cols; tid += blockDim.x) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
if (sp[id] > _max[threadIdx.x])
@ -210,7 +210,7 @@ void Softmax(Tensor out, Tensor in, Tensor mask) {
__global__ void gLogSoftmax(float* out,
const Shape outShape,
const float* in) {
int rows = outShape[0];
int rows = outShape[0] * outShape[2] * outShape[3];
int cols = outShape[1];
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
@ -222,7 +222,7 @@ __global__ void gLogSoftmax(float* out,
float* _max = _share + blockDim.x;
_max[threadIdx.x] = sp[threadIdx.x];
for(int tid = 1; tid < cols; tid += blockDim.x) {
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
if (sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id];
@ -277,8 +277,8 @@ __global__ void gLogSoftmax(float* out,
void LogSoftmax(Tensor out, Tensor in) {
cudaSetDevice(out->getDevice());
size_t m = out->shape()[0];
size_t m = out->shape()[0] * out->shape()[2] * out->shape()[3];
size_t k = out->shape()[1];
int blocks = std::min(MAX_BLOCKS, (int) m);
@ -392,12 +392,12 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val,
void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
cudaSetDevice(adj->getDevice());
// grad and val are both m-by-k matrices, passed as input.
// A weighted average of each row of grad (according to the weights
// specified in val) is computed and subtracted from Out.
// adj is multiplied element-wise to obtain the backward step in autodiff
int m = grad->shape()[0];
int m = grad->shape()[0] * grad->shape()[2] * grad->shape()[3];
int k = grad->shape()[1];
int blocks = std::min(MAX_BLOCKS, m);
@ -548,7 +548,7 @@ __global__ void gCopyRows(float* out, const float* in, size_t cols,
void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces) {
cudaSetDevice(out->getDevice());
size_t cols = in->shape()[1];
size_t rowsToCopy = indeces.size();
@ -589,7 +589,7 @@ __global__ void gPasteRows(float* out, const float* in, size_t cols,
void PasteRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces) {
cudaSetDevice(out->getDevice());
size_t cols = in->shape()[1];
size_t rowsToCopy = indeces.size();
@ -610,19 +610,23 @@ void PasteRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces)
void Transpose(cublasHandle_t cublasHandle, Tensor out, const Tensor in) {
cudaSetDevice(out->getDevice());
size_t m = in->shape()[0];
size_t n = in->shape()[1];
float alpha = 1.0;
float beta = 0.0;
size_t steps = in->shape()[2] * in->shape()[3];
for(int i = 0; i < steps; i++) {
size_t m = in->shape()[0];
size_t n = in->shape()[1];
float alpha = 1.0;
float beta = 0.0;
cublasSgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, in->data(), n,
&beta, in->data(), n, out->data(), m);
size_t offset = i * m * n;
cublasSgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, in->data() + offset, n,
&beta, in->data() + offset, n, out->data() + offset, m);
}
}
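Transpose now handles batched input by looping over the shape[2] * shape[3] slices and calling cublasSgeam once per m-by-n slab. Since slabs are contiguous, slice i starts m * n elements into each buffer, which is the offset computed above:

// slice i of an {m, n, d2, d3} tensor starts at element i * m * n:
//   in->data()  + i*m*n   -- source slab (read with leading dimension n)
//   out->data() + i*m*n   -- destination slab (written with leading dimension m)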
void Concatenate0(Tensor out, const std::vector<Tensor>& inputs) {
cudaSetDevice(out->getDevice());
size_t offset = 0;
for(auto in : inputs) {
UTIL_THROW_IF2(out->shape()[1] != in->shape()[1],
@ -658,9 +662,9 @@ __global__ void gInsertCols(float* out, const float* in,
// dimensions, verify this!
void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
cudaSetDevice(out->getDevice());
size_t offset = 0;
int rows = out->shape()[0];
int rows = out->shape()[0] * out->shape()[2] * out->shape()[3];
int cols_out = out->shape()[1];
for(auto in : inputs) {
@ -690,7 +694,7 @@ void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax) {
void Deconcatenate0(std::vector<Tensor>& outputs, const Tensor in) {
cudaSetDevice(in->getDevice());
size_t offset = 0;
for(auto out : outputs) {
cudaMemcpy(out->data(),
@ -703,9 +707,9 @@ void Deconcatenate0(std::vector<Tensor>& outputs, const Tensor in) {
void Deconcatenate1(std::vector<Tensor>& outputs, const Tensor in) {
cudaSetDevice(in->getDevice());
size_t offset = 0;
int rows = in->shape()[0];
int rows = in->shape()[0] * in->shape()[2] * in->shape()[3];
int cols_in = in->shape()[1];
for(auto out : outputs) {
UTIL_THROW_IF2(out->shape()[0] != in->shape()[0],
@ -778,8 +782,8 @@ __global__ void gGRUFastForward(float* out,
void GRUFastForward(Tensor out, std::vector<Tensor> inputs, bool final){
cudaSetDevice(out->getDevice());
int rows = out->shape()[0];
int rows = out->shape()[0] * out->shape()[2] * out->shape()[3];
int cols = out->shape()[1];
int blocks = std::min(MAX_BLOCKS, rows);
@ -881,10 +885,10 @@ __global__ void gGRUFastBackward(float* outState,
void GRUFastBackward(std::vector<Tensor> outputs,
std::vector<Tensor> inputs,
Tensor adj, bool final) {
cudaSetDevice(adj->getDevice());
int rows = adj->shape()[0];
int rows = adj->shape()[0] * adj->shape()[2] * adj->shape()[3];
int cols = adj->shape()[1];
int blocks = std::min(MAX_BLOCKS, rows);
@ -975,7 +979,7 @@ __global__ void gCrossEntropyPick(float* out,
void CrossEntropyPick(Tensor out, Tensor in, Tensor pick) {
cudaSetDevice(out->getDevice());
size_t m = in->shape()[0];
size_t k = in->shape()[1];
@ -1065,7 +1069,7 @@ __global__ void gCrossEntropyPickBackward(float* out,
void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
cudaSetDevice(out->getDevice());
size_t m = out->shape()[0];
size_t k = out->shape()[1];
@ -1082,7 +1086,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
float L2Norm(Tensor in) {
cudaSetDevice(in->getDevice());
float* data;
cudaMalloc(&data, sizeof(float));
Tensor out(new TensorBase(data, {1, 1}, in->getDevice()));
@ -1094,21 +1098,24 @@ float L2Norm(Tensor in) {
}
__global__ void gAtt(float* out,
const float* in1,
const float* in2,
const float* in3,
int m, // rows
int k, // cols
int n // rows of in2
) {
const float* va,
const float* ctx,
const float* state,
const float* cov,
int m, // total rows (batch x time x beam)
int k, // depth
int b, // batch size
int t // time of ctx
) {
int rows = m;
int cols = k;
for(int bid = 0; bid < m; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
const float* in1Row = in1 + j * cols;
const float* in2Row = in2 + (j % n) * cols;
const float* in3Row = in3;
const float* vaRow = va;
const float* ctxRow = ctx + (j % (b * t)) * cols;
const float* stateRow = state + (j / (b * t) + j % b) * cols;
const float* covRow = cov ? cov + (j % (b * t)) * cols : nullptr;
extern __shared__ float _share[];
float* _sum = _share + blockDim.x;
@ -1117,7 +1124,10 @@ __global__ void gAtt(float* out,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
float ex = tanhf(in1Row[id] + in2Row[id]) * in3Row[id];
float z = ctxRow[id] + stateRow[id];
if(cov)
z += covRow[id];
float ex = tanhf(z) * vaRow[id];
_sum[threadIdx.x] += ex;
}
}
@ -1136,33 +1146,39 @@ __global__ void gAtt(float* out,
}
}
void Att(Tensor out, Tensor context, Tensor state, Tensor va) {
void Att(Tensor out,
Tensor va,
Tensor context,
Tensor state,
Tensor coverage) {
cudaSetDevice(out->getDevice());
size_t m = context->shape()[0] * context->shape()[2] * context->shape()[3];
size_t k = context->shape()[1];
size_t n = context->shape()[0];
size_t m = out->shape()[0] * out->shape()[2] * out->shape()[3];
size_t b = context->shape()[0];
size_t k = context->shape()[1];
size_t t = context->shape()[2];
int blocks = std::min(MAX_BLOCKS, (int) m);
int threads = std::min(MAX_THREADS, (int) k);
int shared = sizeof(float) * threads * 2;
gAtt<<<blocks, threads, shared>>>(out->data(),
va->data(),
context->data(),
state->data(),
va->data(),
m, k, n);
coverage ? coverage->data() : nullptr,
m, k, b, t);
}
__global__ void gAttBack(float* gContext,
__global__ void gAttBack(float* gVa,
float* gContext,
float* gState,
float* gVa,
float* gCoverage,
const float* va,
const float* context,
const float* state,
const float* va,
const float* coverage,
const float* adj,
int m, // rows
int k, // cols
@ -1175,18 +1191,26 @@ __global__ void gAttBack(float* gContext,
if(j < rows) {
float* gcRow = gContext + j * cols;
float* gsRow = gState + (j % n) * cols;
float* gcovRow = gCoverage ? gCoverage + j * cols : nullptr;
const float* cRow = context + j * cols;
const float* sRow = state + (j % n) * cols;
const float* covRow = coverage ? coverage + j * cols : nullptr;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
float t = tanhf(cRow[id] + sRow[id]);
float z = cRow[id] + sRow[id];
if(coverage)
z += covRow[id];
float t = tanhf(z);
float r = va[id] * (1.f - t * t);
gcRow[id] += r * adj[j];
gsRow[id] += r * adj[j];
if(gCoverage)
gcovRow[id] += r * adj[j];
atomicAdd(gVa + id, t * adj[j]);
}
}
@ -1195,11 +1219,11 @@ __global__ void gAttBack(float* gContext,
}
void AttBack(Tensor gContext, Tensor gState, Tensor gVa,
Tensor context, Tensor state, Tensor va,
void AttBack(Tensor gVa, Tensor gContext, Tensor gState, Tensor gCoverage,
Tensor va, Tensor context, Tensor state, Tensor coverage,
Tensor adj) {
cudaSetDevice(adj->getDevice());
size_t m = context->shape()[0] * context->shape()[2] * context->shape()[3];
size_t k = context->shape()[1];
@ -1208,16 +1232,208 @@ void AttBack(Tensor gContext, Tensor gState, Tensor gVa,
int blocks = std::min(MAX_BLOCKS, (int) n);
int threads = std::min(MAX_THREADS, (int) k);
gAttBack<<<blocks, threads>>>(gContext->data(),
gAttBack<<<blocks, threads>>>(gVa->data(),
gContext->data(),
gState->data(),
gVa->data(),
gCoverage ? gCoverage->data() : nullptr,
va->data(),
context->data(),
state->data(),
va->data(),
coverage ? coverage->data() : nullptr,
adj->data(),
m, k, n);
}
__global__ void gLNormalization(float* out, const float* in, const float* alpha, const float* beta,
int rows, int cols, float eps=1e-9) {
extern __shared__ float _share[];
for (int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if (j < rows) {
float* so = out + j * cols;
const float* sp = in + j * cols;
float* _sum = _share + blockDim.x;
_sum[threadIdx.x] = 0.0f;
for (int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
_sum[threadIdx.x] += sp[id];
}
}
__syncthreads();
int len = blockDim.x;
while(len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
_sum[threadIdx.x] += _sum[threadIdx.x + skip];
}
len = (len + 1) >> 1;
}
__syncthreads();
float mean = _sum[0] / cols;
__syncthreads();
float* _sqSum = _share + blockDim.x;
_sqSum[threadIdx.x] = 0.0;
for (int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
float ex = sp[id] - mean;
_sqSum[threadIdx.x] += ex * ex;
}
}
__syncthreads();
len = blockDim.x;
while(len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if(threadIdx.x < (len >> 1))
_sqSum[threadIdx.x] += _sqSum[threadIdx.x + skip];
len = (len + 1) >> 1;
}
__syncthreads();
float sigma = sqrtf(eps + (_sqSum[0] / cols));
__syncthreads();
for (int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
float t = alpha[id] * ((sp[id] - mean) / sigma);
if (beta != nullptr)
t += beta[id];
so[id] = t;
}
}
}
}
}
void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps) {
cudaSetDevice(out->getDevice());
int rows = in->shape()[0] * in->shape()[2] * in->shape()[3];
int cols = in->shape()[1];
int blocks = std::min(MAX_BLOCKS, (int)rows);
int threads = std::min(MAX_THREADS, (int)cols);
int shared = 2 * threads * sizeof(float);
gLNormalization<<<blocks, threads, shared>>>(out->data(),
in->data(),
gamma->data(),
beta ? beta->data() : nullptr,
rows, cols, eps);
}
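For sanity-checking gLNormalization, here is a single-row CPU reference of the same forward pass (a sketch only; it follows the eps-inside-sqrt and optional-beta conventions used above):

#include <cmath>

// Per-row reference: y[i] = gamma[i] * (x[i] - mean) / sqrt(eps + var) (+ beta[i])
void layerNormRowRef(const float* x, float* y,
                     const float* gamma, const float* beta,
                     int cols, float eps = 1e-9f) {
  float mean = 0.f;
  for(int i = 0; i < cols; ++i)
    mean += x[i];
  mean /= cols;
  float var = 0.f;
  for(int i = 0; i < cols; ++i) {
    float d = x[i] - mean;
    var += d * d;
  }
  var /= cols;
  float sigma = std::sqrt(eps + var);
  for(int i = 0; i < cols; ++i) {
    float t = gamma[i] * ((x[i] - mean) / sigma);
    y[i] = beta ? t + beta[i] : t;
  }
}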
__global__ void gLayerNormalizationGrad(float* gradX, float* gradGamma, float* gradBeta,
float* adj, float* y, float* x, float* gamma, float* beta,
int rows, int cols, float eps=1e-9) {
extern __shared__ float shared[];
for (int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if (j < rows) {
float* sum_adj = shared;
float* sum_adj_x = shared + blockDim.x;
float* sum_x = shared + 2 * blockDim.x;
float* sum_sqr = shared + 3 * blockDim.x;
const float* xRow = x + j * cols;
const float* yRow = y + j * cols;
const float* adjRow = adj + j * cols;
float* gradXRow = gradX + j * cols;
sum_x[threadIdx.x] = 0.0f;
sum_adj[threadIdx.x] = 0.0f;
sum_adj_x[threadIdx.x] = 0.0f;
sum_sqr[threadIdx.x] = 0.0f;
for (int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
sum_x[threadIdx.x] += xRow[id];
sum_adj_x[threadIdx.x] += adjRow[id] * (yRow[id] - ((beta) ? beta[id] : 0)) / gamma[id];
sum_adj[threadIdx.x] += adjRow[id];
}
}
__syncthreads();
int len = blockDim.x;
while(len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
sum_x[threadIdx.x] += sum_x[threadIdx.x + skip];
sum_adj[threadIdx.x] += sum_adj[threadIdx.x + skip];
sum_adj_x[threadIdx.x] += sum_adj_x[threadIdx.x + skip];
}
len = (len + 1) >> 1;
}
__syncthreads();
float mean = sum_x[0] / cols;
__syncthreads();
for (int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols) {
float ex = xRow[id] - mean;
sum_sqr[threadIdx.x] += ex * ex;
}
}
__syncthreads();
len = blockDim.x;
while(len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if(threadIdx.x < (len >> 1))
sum_sqr[threadIdx.x] += sum_sqr[threadIdx.x + skip];
len = (len + 1) >> 1;
}
__syncthreads();
float sigma = sqrtf(eps + (sum_sqr[0] / cols));
__syncthreads();
for (int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
float grad_x = 0.0f;
float x_hat = (yRow[id] - ((beta) ? beta[id] : 0) ) / gamma[id];
grad_x += cols * adjRow[id];
grad_x -= sum_adj[0];
grad_x -= sum_adj_x[0] * x_hat;
grad_x /= (cols * sigma);
gradXRow[id] += gamma[id] * grad_x;
atomicAdd(gradGamma + id, adjRow[id] * x_hat);
if (beta) {
atomicAdd(gradBeta + id, adjRow[id]);
}
}
}
}
}
}
void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta,
Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta) {
cudaSetDevice(adj->getDevice());
int rows = y->shape()[0] * y->shape()[2] * y->shape()[3];
int cols = y->shape()[1];
int threads = std::min(MAX_THREADS, cols);
int blocks = std::min(MAX_BLOCKS, rows);
int shared = sizeof(float) * threads * 4;
gLayerNormalizationGrad<<<blocks, threads, shared>>>
(gradX->data(), gradGamma->data(), (gradBeta) ? gradBeta->data() : nullptr,
adj->data(), y->data(), x->data(), gamma->data(),(beta) ? beta->data() : nullptr, rows, cols);
}
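In the notation of the kernel above (delta = adj, C = cols, and x-hat recovered as (y - beta) / gamma), gLayerNormalizationGrad accumulates the standard layer-norm gradients into the grad buffers:

$$\frac{\partial L}{\partial x_i} = \frac{\gamma_i}{C\,\sigma}\Big(C\,\delta_i - \sum_j \delta_j - \hat{x}_i \sum_j \delta_j\,\hat{x}_j\Big), \qquad \frac{\partial L}{\partial \gamma_i} = \sum_{\text{rows}} \delta_i\,\hat{x}_i, \qquad \frac{\partial L}{\partial \beta_i} = \sum_{\text{rows}} \delta_i$$

with $\sigma = \sqrt{\varepsilon + \operatorname{Var}(x)}$ computed per row; the gamma and beta terms are summed over rows via atomicAdd.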
} // namespace marian

View File

@ -45,8 +45,9 @@ __global__ void gAdd(Functor functor,
Shape outShape,
const float* in1,
const Shape in1Shape,
const Shape full) {
const Shape full,
float scale = 1.0) {
int outLength = outShape.elements();
bool same = outLength == full.elements() && outLength == in1Shape.elements();
@ -61,7 +62,7 @@ __global__ void gAdd(Functor functor,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {
if(same) {
out[index] += functor(in1[index]);
out[index] += functor(in1[index]) * scale;
}
else {
outShape.dims(index, dims);
@ -83,15 +84,75 @@ __global__ void gAdd(Functor functor,
}
}
if(sum)
out[index] += sum;
out[index] += sum * scale;
}
}
}
}
template <class Functor>
__global__ void gAdd1(Functor functor,
float* out,
Shape outShape,
const float* in1,
const Shape in1Shape,
const Shape full,
float scale = 1.0) {
int rows = full[0] * full[2] * full[3];
int cols = full[1];
bool same = in1Shape.elements() == full.elements();
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
extern __shared__ float _share[];
float* _sum = _share + blockDim.x;
if(same) {
const float* sp = in1 + j * cols;
_sum[threadIdx.x] = 0;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
_sum[threadIdx.x] += functor(sp[id]);
}
}
}
else {
int dims[4];
_sum[threadIdx.x] = 0;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
full.dims(j * cols + id, dims);
int in1Index = in1Shape.bindex(dims);
_sum[threadIdx.x] += functor(in1[in1Index]);
}
}
}
__syncthreads();
int len = blockDim.x;
while(len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
_sum[threadIdx.x] += _sum[threadIdx.x + skip];
}
len = (len + 1) >> 1;
}
__syncthreads();
out[j] += _sum[0] * scale;
}
}
}
template <class Functor>
void Add(Functor functor,
Tensor out, Tensor in) {
Tensor out, Tensor in, float scale = 1.0) {
cudaSetDevice(out->getDevice());
@ -101,20 +162,36 @@ void Add(Functor functor,
int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
if(full.elements() / length == full[1]) {
size_t m = full.elements() / length;
size_t k = full[1];
gAdd<<<blocks, threads>>>(functor,
out->data(), out->shape(),
in->data(), in->shape(),
full);
int blocks = std::min(MAX_BLOCKS, (int) m);
int threads = std::min(MAX_THREADS, (int) k);
int shared = sizeof(float) * threads * 2;
gAdd1<<<blocks, threads, shared>>>(functor,
out->data(), out->shape(),
in->data(), in->shape(),
full, scale);
}
else {
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gAdd<<<blocks, threads>>>(functor,
out->data(), out->shape(),
in->data(), in->shape(),
full, scale);
}
}
template <class Functor, class T1, class T2>
void Reduce(Functor functor,
T1 out, T2 in) {
T1 out, T2 in, float scale = 1.0) {
out->set(0);
Add(functor, out, in);
Add(functor, out, in, scale);
}
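The new scale argument on Add/Reduce lets callers fold a constant factor into the reduction itself; MeanNodeOp above uses it to turn a sum into a mean without a second elementwise pass:

// Before: Reduce(_1 * scale, val_, child->val());   // scale inside the functor
// After:  Reduce(_1, val_, child->val(), scale);    // kernel multiplies the
//                                                   // reduced sum once per output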
template <class Functor>
@ -125,7 +202,8 @@ __global__ void gAdd(Functor functor,
const Shape in1Shape,
const float* in2,
const Shape in2Shape,
const Shape full) {
const Shape full,
float scale = 1.0) {
int outLength = outShape.elements();
@ -144,7 +222,7 @@ __global__ void gAdd(Functor functor,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if (index < outLength) {
if(same) {
out[index] += functor(in1[index], in2[index]);
out[index] += functor(in1[index], in2[index]) * scale;
}
else {
outShape.dims(index, dims);
@ -166,15 +244,80 @@ __global__ void gAdd(Functor functor,
}
}
if(sum)
out[index] += sum;
out[index] += sum * scale;
}
}
}
}
template <class Functor>
__global__ void gAdd1(Functor functor,
float* out,
Shape outShape,
const float* in1,
const Shape in1Shape,
const float* in2,
const Shape in2Shape,
const Shape full,
float scale = 1.0) {
int rows = full[0] * full[2] * full[3];
int cols = full[1];
bool same = in1Shape.elements() == full.elements()
&& in2Shape.elements() == full.elements();
for(int bid = 0; bid < rows; bid += gridDim.x) {
int j = bid + blockIdx.x;
if(j < rows) {
extern __shared__ float _share[];
float* _sum = _share + blockDim.x;
if(same) {
const float* sp1 = in1 + j * cols;
const float* sp2 = in2 + j * cols;
_sum[threadIdx.x] = 0;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
_sum[threadIdx.x] += functor(sp1[id], sp2[id]);
}
}
}
else {
int dims[4];
_sum[threadIdx.x] = 0;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if (id < cols) {
full.dims(j * cols + id, dims);
int in1Index = in1Shape.bindex(dims);
int in2Index = in2Shape.bindex(dims);
_sum[threadIdx.x] += functor(in1[in1Index], in2[in2Index]);
}
}
}
__syncthreads();
int len = blockDim.x;
while(len != 1) {
__syncthreads();
int skip = (len + 1) >> 1;
if (threadIdx.x < (len >> 1)) {
_sum[threadIdx.x] += _sum[threadIdx.x + skip];
}
len = (len + 1) >> 1;
}
__syncthreads();
out[j] += _sum[0] * scale;
}
}
}
template <class Functor>
void Add(Functor functor,
Tensor out, Tensor in1, Tensor in2) {
Tensor out, Tensor in1, Tensor in2, float scale = 1.0) {
cudaSetDevice(out->getDevice());
@ -186,23 +329,39 @@ void Add(Functor functor,
int length = out->shape().elements();
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
/*
if(full.elements() / length == full[1]) {
size_t m = full.elements() / length;
size_t k = full[1];
gAdd<<<blocks, threads>>>(functor,
out->data(), out->shape(),
in1->data(), in1->shape(),
in2->data(), in2->shape(),
full);
int blocks = std::min(MAX_BLOCKS, (int) m);
int threads = std::min(MAX_THREADS, (int) k);
int shared = sizeof(float) * threads * 2;
gAdd1<<<blocks, threads, shared>>>(functor,
out->data(), out->shape(),
in1->data(), in1->shape(),
in2->data(), in2->shape(),
full);
}
else {*/
int threads = std::min(MAX_THREADS, length);
int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
gAdd<<<blocks, threads>>>(functor,
out->data(), out->shape(),
in1->data(), in1->shape(),
in2->data(), in2->shape(),
full, scale);
//}
}
template <class Functor>
void Reduce(Functor functor,
Tensor out, Tensor in1, Tensor in2) {
Tensor out, Tensor in1, Tensor in2, float scale = 1.0) {
out->set(0);
Add(functor, out, in1, in2);
Add(functor, out, in1, in2, scale);
}
@ -680,9 +839,13 @@ void GRUFastBackward(std::vector<Tensor> outputs,
std::vector<Tensor> inputs,
Tensor adj, bool final = false);
void Att(Tensor out, Tensor context, Tensor state, Tensor va);
void AttBack(Tensor gContext, Tensor gState, Tensor gva,
Tensor context, Tensor state, Tensor va,
void Att(Tensor out, Tensor va, Tensor context, Tensor state, Tensor coverage);
void AttBack(Tensor gva, Tensor gContext, Tensor gState, Tensor gCoverage,
Tensor va, Tensor context, Tensor state, Tensor coverage,
Tensor adj);
void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps=1e-9);
void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta,
Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta);
}

View File

@ -48,6 +48,7 @@ namespace thrust
return compose(unary_operator<unary_exp>(), _1);
}
template<typename T>
struct unary_log : public thrust::unary_function<T,T> {
__host__ __device__
@ -166,6 +167,33 @@ namespace thrust
make_actor(_1),
make_actor(_2));
}
template<typename T>
struct binary_pow : public thrust::binary_function<T, T, T> {
__host__ __device__
T operator()(const T &x, const T &y) const {
float tx = x;
if(y == (int)y && (int)y % 2 == 0)
tx = abs(x);
return powf(tx, y);
}
};
template<typename T1, typename T2>
__host__ __device__
actor<
composite<
binary_operator<binary_pow>,
actor<T1>,
typename as_actor<T2>::type
>
>
Pow(const actor<T1> &_1, const T2 &_2)
{
return compose(binary_operator<binary_pow>(),
make_actor(_1),
make_actor(_2));
}
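A note on binary_pow: taking |x| for even integral exponents looks like a guard for fast-math builds, where powf(x, y) is lowered to something like exp2f(y * log2f(x)) and yields NaN for negative x; for even y the sign cannot affect the result, so the rewrite is exact (this rationale is an inference, not stated in the source):

// x = -3, y = 2 : tx = |x| = 3, powf(3, 2) = 9 = (-3)^2
//                 (NaN under fast math without the guard)
// x = -3, y = 3 : odd exponent, tx stays -3 and powf handles it as usual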
}
}
}

View File

@ -2,6 +2,7 @@
#include "marian.h"
#include "graph/expression_graph.h"
#include "layers/rnn.h"
namespace marian {
@ -13,17 +14,19 @@ struct AttentionNodeOp : public NaryNodeOp {
keywords::shape=newShape(nodes)) {}
Shape newShape(const std::vector<Expr>& nodes) {
Shape shape = nodes[0]->shape();
Shape shape2 = nodes[1]->shape();
Shape shape3 = nodes[2]->shape();
Shape shape = nodes[1]->shape();
for(int i = 0; i < shape2.size(); ++i) {
UTIL_THROW_IF2(shape[i] != shape2[i] && shape[i] != 1 && shape2[i] != 1,
Shape vaShape = nodes[0]->shape();
Shape ctxShape = nodes[1]->shape();
Shape stateShape = nodes[2]->shape();
for(int i = 0; i < stateShape.size(); ++i) {
UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1 && stateShape[i] != 1,
"Shapes cannot be broadcasted");
shape.set(i, std::max(shape[i], shape2[i]));
shape.set(i, std::max(ctxShape[i], stateShape[i]));
}
UTIL_THROW_IF2(shape3[0] != shape[1] || shape3[1] != 1,
UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1,
"Wrong size");
shape.set(1, 1);
@ -35,7 +38,8 @@ struct AttentionNodeOp : public NaryNodeOp {
NodeOp(Att(val_,
children_[0]->val(),
children_[1]->val(),
children_[2]->val()))
children_[2]->val(),
children_.size() == 4 ? children_[3]->val() : nullptr))
};
}
@ -46,9 +50,11 @@ struct AttentionNodeOp : public NaryNodeOp {
children_[0]->grad(),
children_[1]->grad(),
children_[2]->grad(),
children_.size() == 4 ? children_[3]->grad() : nullptr,
children_[0]->val(),
children_[1]->val(),
children_[2]->val(),
children_.size() == 4 ? children_[3]->val() : nullptr,
adj_
);
)
@ -70,22 +76,33 @@ struct AttentionNodeOp : public NaryNodeOp {
}
};
Expr attOps(Expr context, Expr state, Expr va) {
std::vector<Expr> nodes{context, state, va};
Expr attOps(Expr va, Expr context, Expr state, Expr coverage=nullptr) {
std::vector<Expr> nodes{va, context, state};
if(coverage)
nodes.push_back(coverage);
int dimBatch = context->shape()[0];
int dimWords = context->shape()[2];
int dimBeam = state->shape()[3];
return reshape(Expression<AttentionNodeOp>(nodes),
{dimWords, dimBatch});
{dimWords, dimBatch, 1, dimBeam});
}
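With the projections already folded into mappedContext and mappedState by the callers below, AttentionNodeOp computes the (optionally coverage-extended) Bahdanau attention energies

$$e_j = v_a^{\top} \tanh\big(\bar{c}_j + \bar{s} + \mathrm{cov}_j\big)$$

for each source position j (batch and beam flattened into rows), where $\bar{c} = \text{context} \cdot U_a + b_a$ and $\bar{s} = \text{state} \cdot W_a$; the softmax over j is applied outside this op.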
class GlobalAttention {
private:
Expr Wa_, ba_, Ua_, va_;
Expr gammaContext_, betaContext_;
Expr gammaState_, betaState_;
Expr context_;
Expr softmaxMask_;
Expr mappedContext_;
std::vector<Expr> contexts_;
std::vector<Expr> alignments_;
bool layerNorm_;
Expr cov_;
public:
@ -95,21 +112,34 @@ class GlobalAttention {
int dimDecState,
Args ...args)
: context_(context),
softmaxMask_(nullptr) {
softmaxMask_(nullptr),
layerNorm_(Get(keywords::normalize, false, args...)),
cov_(Get(keywords::coverage, nullptr, args...)) {
int dimEncState = context->shape()[1];
auto graph = context->graph();
Wa_ = graph->param(prefix + "_W_comb_att", {dimDecState, dimEncState},
keywords::init=inits::glorot_uniform);
ba_ = graph->param(prefix + "_b_att", {1, dimEncState},
keywords::init=inits::zeros);
Ua_ = graph->param(prefix + "_Wc_att", {dimEncState, dimEncState},
keywords::init=inits::glorot_uniform);
va_ = graph->param(prefix + "_U_att", {dimEncState, 1},
keywords::init=inits::glorot_uniform);
ba_ = graph->param(prefix + "_b_att", {1, dimEncState},
keywords::init=inits::zeros);
mappedContext_ = affine(context_, Ua_, ba_);
if(layerNorm_) {
gammaContext_ = graph->param(prefix + "_att_gamma1", {1, dimEncState},
keywords::init=inits::from_value(1.0));
gammaState_ = graph->param(prefix + "_att_gamma2", {1, dimEncState},
keywords::init=inits::from_value(1.0));
mappedContext_ = layer_norm(dot(context_, Ua_), gammaContext_, ba_);
}
else {
mappedContext_ = affine(context_, Ua_, ba_);
}
auto softmaxMask = Get(keywords::mask, nullptr, args...);
if(softmaxMask) {
@ -124,21 +154,23 @@ class GlobalAttention {
int dimBatch = context_->shape()[0];
int srcWords = context_->shape()[2];
int dimBeam = state->shape()[3];
auto mappedState = dot(state, Wa_);
auto attReduce = attOps(mappedContext_, mappedState, va_);
if(layerNorm_)
mappedState = layer_norm(mappedState, gammaState_);
auto attReduce = attOps(va_, mappedContext_, mappedState);
// @TODO: horrible ->
auto e = reshape(
transpose(softmax(transpose(attReduce),
softmaxMask_)),
{dimBatch, 1, srcWords});
auto e = reshape(transpose(softmax(transpose(attReduce), softmaxMask_)),
{dimBatch, 1, srcWords, dimBeam});
// <- horrible
auto alignedSource = weighted_average(context_, e,
axis=2);
auto alignedSource = weighted_average(context_, e, axis=2);
contexts_.push_back(alignedSource);
alignments_.push_back(e);
return alignedSource;
}

View File

@ -1,60 +0,0 @@
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>
#include "tensors/tensor.h"
#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(1);}} while(0)
#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__);\
exit(1);}} while(0)
__global__
void gScalled(float* data, int n, float p) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
while (index < n) {
data[index] = (data[index] < p) / p;
index += gridDim.x * blockDim.x;
}
}
namespace marian {
class DropoutGenerator {
public:
DropoutGenerator(cudaStream_t stream=0, unsigned long long seed = 1234ULL) {
CURAND_CALL(curandCreateGenerator(&generator_, CURAND_RNG_PSEUDO_DEFAULT));
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator_, seed));
CURAND_CALL(curandSetStream(generator_, stream));
}
void Generate(Tensor& tensor, float p) {
Generate(tensor->data(), tensor->size(), p);
}
void Generate(float* data, int n, float p) {
CURAND_CALL(curandGenerateUniform(generator_, data, n));
int numThreads = std::min(n, 512);
int numBlocks = n / numThreads + (n % numThreads != 0);
gScalled<<<numBlocks, numThreads>>>(data, n, p);
}
~DropoutGenerator() {
CURAND_CALL(curandDestroyGenerator(generator_));
}
private:
curandGenerator_t generator_;
};
}

View File

@ -28,6 +28,7 @@ namespace marian {
private:
int outDim_;
act activation_;
bool layerNorm_;
public:
template <class ...Args>
@ -38,18 +39,30 @@ namespace marian {
outDim_(outDim),
activation_(Get(keywords::activation,
act::linear,
args...)) {}
args...)),
layerNorm_(Get(keywords::normalize,
false, args...)) {}
Expr operator()(Expr in) {
auto g = in->graph();
auto W = g->param(name_ + "_W", {in->shape()[1], outDim_},
keywords::init=inits::glorot_uniform);
auto b = g->param(name_ + "_b", {1, outDim_},
keywords::init=inits::zeros);
keywords::init=inits::zeros);
params_ = { W, b };
auto out = affine(in, W, b);
Expr out;
if(layerNorm_) {
auto gamma = g->param(name_ + "_gamma", {1, outDim_},
keywords::init=inits::from_value(1.0));
params_.push_back(gamma);
out = layer_norm(dot(in, W), gamma, b);
}
else {
out = affine(in, W, b);
}
switch (activation_) {
case act::linear :
@ -81,13 +94,21 @@ namespace marian {
{in->shape()[1], outDim_},
keywords::init=inits::glorot_uniform);
auto b = g->param(name_ + "_b" + std::to_string(i),
{1, outDim_},
keywords::init=inits::zeros);
{1, outDim_},
keywords::init=inits::zeros);
params_.push_back(W);
params_.push_back(b);
outputs.push_back(affine(in, W, b));
if(layerNorm_) {
auto gamma = g->param(name_ + "_gamma" + std::to_string(i), {1, outDim_},
keywords::init=inits::from_value(1.0));
params_.push_back(gamma);
outputs.push_back(layer_norm(dot(in, W), gamma, b));
}
else {
outputs.push_back(affine(in, W, b));
}
i++;
}
@ -145,7 +166,7 @@ namespace marian {
auto mask = Get(keywords::mask, nullptr, args...);
auto ce = cross_entropy(in, picks);
if(mask)
ce = ce * mask;

View File

@ -1,23 +1,3 @@
// This file is part of the Marian toolkit.
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <random>
#include <algorithm>
@ -44,9 +24,6 @@ float xor128() {
return 0.1 * ((w % 1000) / 1000.f) - 0.05;
}
// Use a constant seed for deterministic behaviour.
//std::default_random_engine engine(42);
void zeros(Tensor t) {
t->set(0.f);
}

View File

@ -1,26 +1,5 @@
#pragma once
// This file is part of the Marian toolkit.
// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include <functional>
#include <random>
#include "tensors/tensor.h"
@ -47,6 +26,7 @@ template <class Distribution>
void distribution(std::vector<float>& vals, float a, float b) {
std::random_device device;
std::default_random_engine engine(device());
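// Note: the fixed seed below overrides the random_device seeding above,
// so parameter initialization is deterministic across runs.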
engine.seed(1234);
Distribution dist(a, b);
auto gen = std::bind(dist, engine);

View File

@ -1,3 +1,5 @@
#pragma once
#include <algorithm>
#include <chrono>
#include <iomanip>
@ -10,7 +12,6 @@
#include "graph/expression_graph.h"
#include "layers/generic.h"
#include "layers/attention.h"
namespace marian {
@ -56,6 +57,7 @@ class Tanh {
template <class Cell>
class RNN : public Layer {
public:
int dimInput_;
int dimState_;
dir direction_;
bool outputLast_;
@ -63,15 +65,17 @@ class RNN : public Layer {
Ptr<Cell> cell_;
template <typename ...Args>
RNN(const std::string& name,
int dimState,
Cell cell,
Args ...args)
RNN(Ptr<ExpressionGraph> graph,
const std::string& name,
int dimInput,
int dimState,
Args ...args)
: Layer(name),
dimInput_{dimInput},
dimState_{dimState},
direction_{Get(keywords::direction, dir::forward, args...)},
outputLast_{Get(keywords::output_last, false, args...)},
cell_(New<Cell>(cell)) {}
cell_(New<Cell>(graph, name_, dimInput_, dimState_, args...)) {}
Ptr<Cell> getCell() {
return cell_;
@ -80,7 +84,6 @@ class RNN : public Layer {
std::vector<Expr> apply(const Expr input, const Expr initialState,
const Expr mask = nullptr, bool reverse = false) {
auto xW = cell_->apply1(input);
std::vector<Expr> outputs;
auto state = initialState;
for(size_t i = 0; i < input->shape()[2]; ++i) {
@ -114,13 +117,11 @@ class RNN : public Layer {
auto graph = input->graph();
int dimInput = input->shape()[1];
cell_->initialize(graph, name_, dimInput, dimState_, args...);
Expr mask = Get(keywords::mask, nullptr, args...);
if(direction_ == dir::backward) {
auto states = apply(input, state, mask, true);
//std::reverse(states.begin(), states.end());
std::reverse(states.begin(), states.end());
if(outputLast_)
return states.back();
else
@ -139,63 +140,130 @@ class RNN : public Layer {
}
};
template <class Cell>
class MLRNN : public Layer {
private:
int layers_;
bool skip_;
bool skipFirst_;
int dimState_;
std::vector<Ptr<RNN<Cell>>> rnns_;
public:
template <typename ...Args>
MLRNN(Ptr<ExpressionGraph> graph,
const std::string& name,
int layers,
int dimInput,
int dimState,
Args ...args)
: Layer(name),
layers_(layers),
skip_(Get(keywords::skip, false, args...)),
skipFirst_(Get(keywords::skip_first, false, args...)),
dimState_{dimState} {
for(int i = 0; i < layers; ++i) {
rnns_.push_back(
New<RNN<Cell>>(graph,
name + "_l" + std::to_string(i),
i == 0 ? dimInput : dimState,
dimState,
args...)
);
}
}
template <typename ...Args>
std::tuple<Expr, std::vector<Expr>>
operator()(Expr input, Args ...args) {
Expr output;
std::vector<Expr> outStates;
for(int i = 0; i < layers_; ++i) {
auto outState = (*rnns_[i])(input, args...);
outStates.push_back(outState);
if(skip_ && (skipFirst_ || i > 0))
output = outState + input;
else
output = outState;
input = output;
}
return std::make_tuple(output, outStates);
}
template <typename ...Args>
std::tuple<Expr, std::vector<Expr>>
operator()(Expr input,
std::vector<Expr> states,
Args ...args) {
Expr output;
std::vector<Expr> outStates;
for(int i = 0; i < layers_; ++i) {
auto outState = (*rnns_[i])(input, states[i], args...);
outStates.push_back(outState);
if(skip_ && (skipFirst_ || i > 0))
output = outState + input;
else
output = outState;
input = output;
}
return std::make_tuple(output, outStates);
}
};
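MLRNN stacks cells with optional residual (skip) connections: with skip set, every layer past the first (or every layer, with skip_first) adds its input back onto its output, which requires the state width to match the layer's input width from layer 1 onward. Schematically:

// layer 0: out0 = rnn0(in)                    (+ in, only if skip_first)
// layer i: outi = rnni(out_{i-1}) + out_{i-1} // residual connection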
template <class Cell>
class BiRNN : public Layer {
public:
int layers_;
int dimState_;
Ptr<MLRNN<Cell>> rnn1_;
Ptr<MLRNN<Cell>> rnn2_;
template <typename ...Args>
BiRNN(const std::string& name,
int dimState,
Cell cell1,
Cell cell2,
Args ...args)
BiRNN(Ptr<ExpressionGraph> graph,
const std::string& name,
int layers,
int dimInput,
int dimState,
Args ...args)
: Layer(name),
layers_(layers),
dimState_{dimState},
rnn1_(New<RNN<Cell>>(name, dimState, cell1,
keywords::direction=dir::forward,
args...)),
rnn2_(New<RNN<Cell>>(name + "_r", dimState, cell2,
keywords::direction=dir::backward,
args...)) {}
rnn1_(New<MLRNN<Cell>>(graph, name, layers, dimInput, dimState,
keywords::direction=dir::forward,
args...)),
rnn2_(New<MLRNN<Cell>>(graph, name + "_r", layers, dimInput, dimState,
keywords::direction=dir::backward,
args...)) {}
template <typename ...Args>
BiRNN(const std::string& name,
int dimState,
Args ...args)
: BiRNN(name, dimState, Cell(), Cell(), args...) {}
std::vector<Expr> operator()(Expr input, Args ...args) {
Expr mask = Get(keywords::mask, nullptr, args...);
auto statesfw = (*rnn1_)(input);
auto statesbw = (*rnn2_)(input, keywords::mask=mask);
template <typename ...Args>
Expr operator()(Expr input, Args ...args) {
auto graph = input->graph();
int dimBatch = input->shape()[0];
auto startState = graph->zeros(keywords::shape={dimBatch, dimState_});
return (*this)(input, startState, args...);
std::vector<Expr> outStates;
for(int i = 0; i < layers_; ++i)
outStates.push_back(concatenate({statesfw[i], statesbw[i]},
keywords::axis=1));
return outStates;
}
template <typename ...Args>
Expr operator()(Expr input, Expr state, Args ...args) {
std::vector<Expr> operator()(Expr input, std::vector<Expr> states, Args ...args) {
Expr mask = Get(keywords::mask, nullptr, args...);
auto statesfw = (*rnn1_)(input, states);
auto statesbw = (*rnn2_)(input, states, keywords::mask=mask);
auto graph = input->graph();
int dimInput = input->shape()[1];
rnn1_->getCell()->initialize(graph, name_, dimInput, dimState_, args...);
auto states1 = rnn1_->apply(input, state, nullptr);
rnn2_->getCell()->initialize(graph, name_ + "_r", dimInput, dimState_, args...);
auto states2 = rnn2_->apply(input, state, mask, true);
std::reverse(states2.begin(), states2.end());
std::vector<Expr> states;
for(int i = 0; i < states1.size(); ++i)
states.push_back(concatenate({states1[i], states2[i]},
keywords::axis=1));
return concatenate(states, keywords::axis=2);
std::vector<Expr> outStates;
for(int i = 0; i < layers_; ++i)
outStates.push_back(concatenate({statesfw[i], statesbw[i]},
keywords::axis=1));
return outStates;
}
};
@ -255,21 +323,32 @@ Expr gruOps(const std::vector<Expr>& nodes, bool final = false) {
return Expression<GRUFastNodeOp>(nodes, final);
}
/***************************************************************/
class GRU {
private:
std::string prefix_;
Expr U_, W_, b_;
Expr gamma1_;
Expr gamma2_;
bool final_;
bool layerNorm_;
float dropout_;
Expr dropMaskX_;
Expr dropMaskS_;
public:
GRU() {}
template <typename ...Args>
void initialize(
ExpressionGraphPtr graph,
GRU(ExpressionGraphPtr graph,
const std::string prefix,
int dimInput,
int dimState,
Args ...args) {
Args ...args) : prefix_(prefix) {
auto U = graph->param(prefix + "_U", {dimState, 2 * dimState},
keywords::init=inits::glorot_uniform);
auto W = graph->param(prefix + "_W", {dimInput, 2 * dimState},
@ -288,19 +367,49 @@ class GRU {
b_ = concatenate({b, bx}, keywords::axis=1);
final_ = Get(keywords::final, false, args...);
layerNorm_ = Get(keywords::normalize, false, args...);
dropout_ = Get(keywords::dropout_prob, 0.0f, args...);
if(layerNorm_) {
gamma1_ = graph->param(prefix + "_gamma1", {1, 3 * dimState},
keywords::init=inits::from_value(1.f));
gamma2_ = graph->param(prefix + "_gamma2", {1, 3 * dimState},
keywords::init=inits::from_value(1.f));
}
if(dropout_> 0.0f) {
dropMaskX_ = graph->dropout(dropout_, {1, dimInput});
dropMaskS_ = graph->dropout(dropout_, {1, dimState});
}
}
Expr apply(Expr input, Expr state, Expr mask = nullptr) {
Expr apply(Expr input, Expr state,
Expr mask = nullptr) {
return apply2(apply1(input), state, mask);
}
Expr apply1(Expr input) {
if(dropMaskX_)
input = dropout(input, keywords::mask=dropMaskX_);
debug(input, "in");
auto xW = dot(input, W_);
if(layerNorm_)
xW = layer_norm(xW, gamma1_);
return xW;
}
Expr apply2(Expr xW, Expr state, Expr mask = nullptr) {
Expr apply2(Expr xW, Expr state,
Expr mask = nullptr) {
if(dropMaskS_)
state = dropout(state, keywords::mask=dropMaskS_);
debug(state, "state");
auto sU = dot(state, U_);
if(layerNorm_)
sU = layer_norm(sU, gamma2_);
auto output = mask ?
gruOps({state, xW, sU, b_, mask}, final_) :
gruOps({state, xW, sU, b_}, final_);
@ -309,6 +418,7 @@ class GRU {
}
};
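This is the commit's headline change: dropMaskX_ and dropMaskS_ are sampled once per graph build with broadcastable shapes {1, dimInput} and {1, dimState}, so the identical mask is applied at every time step (and, via MLRNN, in each layer), variational dropout in the sense of Gal and Ghahramani rather than fresh per-step noise. A self-contained sketch of the reuse pattern (hypothetical names, host-side C++):

#include <cstddef>
#include <random>
#include <vector>

// Sample the mask once per sequence, then reuse it at every step
// (standalone rendering of the dropMaskX_/dropMaskS_ pattern above).
struct VariationalDropout {
  std::vector<float> mask;
  VariationalDropout(int dim, float p, unsigned seed) : mask(dim) {
    std::mt19937 gen(seed);
    std::uniform_real_distribution<float> uni(0.f, 1.f);
    float keep = 1.f - p;
    for(auto& m : mask)
      m = (uni(gen) < keep) / keep;           // inverted dropout: 0 or 1/keep
  }
  void apply(std::vector<float>& x) const {   // same mask at every time step
    for(std::size_t i = 0; i < x.size(); ++i)
      x[i] *= mask[i];
  }
};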
/***************************************************************/
template <class Cell1, class Attention, class Cell2>
@ -320,31 +430,29 @@ class AttentionCell {
public:
AttentionCell(Attention&& att)
: cell1_(New<Cell1>()),
cell2_(New<Cell2>()),
att_(New<Attention>(att)) {}
template <typename ...Args>
void initialize(Ptr<ExpressionGraph> graph,
const std::string prefix,
int dimInput,
int dimState,
Args ...args)
template <class ...Args>
AttentionCell(Ptr<ExpressionGraph> graph,
const std::string prefix,
int dimInput,
int dimState,
Ptr<Attention> att,
Args ...args)
{
cell1_->initialize(graph,
prefix + "_cell1",
dimInput,
dimState,
keywords::final=false,
args...);
cell1_ = New<Cell1>(graph,
prefix + "_cell1",
dimInput,
dimState,
keywords::final=false,
args...);
cell2_->initialize(graph,
prefix + "_cell2",
att_->outputDim(),
dimState,
keywords::final=true,
args...);
att_ = New<Attention>(att);
cell2_ = New<Cell2>(graph,
prefix + "_cell2",
att_->outputDim(),
dimState,
keywords::final=true,
args...);
}
Expr apply(Expr input, Expr state, Expr mask = nullptr) {
@ -361,11 +469,17 @@ class AttentionCell {
return cell2_->apply(alignedSourceContext, hidden, mask);
}
Ptr<Attention> getAttention() {
return att_;
}
Expr getContexts() {
return concatenate(att_->getContexts(), keywords::axis=2);
}
Expr getLastContext() {
return att_->getContexts().back();
}
};
typedef AttentionCell<GRU, GlobalAttention, GRU> CGRU;
}

View File

@ -25,3 +25,5 @@
#include "graph/expression_graph.h"
#include "graph/expression_operators.h"
#include "layers/param_initializers.h"
#include "training/training.h"
#include "training/graph_group.h"

View File

@ -1,7 +1,7 @@
#pragma once
#include "data/corpus.h"
#include "command/config.h"
#include "training/config.h"
#include "graph/expression_graph.h"
#include "layers/rnn.h"
#include "layers/param_initializers.h"
@ -11,10 +11,12 @@
namespace marian {
class Nematus : public ExpressionGraph {
class DL4MT {
private:
Ptr<Config> options_;
Ptr<RNN<CGRU>> rnn_;
int dimSrcVoc_{40000};
int dimSrcEmb_{512};
int dimEncState_{1024};
@ -25,6 +27,8 @@ class Nematus : public ExpressionGraph {
int dimBatch_{64};
bool normalize_;
void setDims(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
dimSrcVoc_ = graph->get("Wemb") ? graph->get("Wemb")->shape()[0] : dimSrcVoc_;
@ -39,14 +43,14 @@ class Nematus : public ExpressionGraph {
}
public:
Nematus() {}
Nematus(Ptr<Config> options)
DL4MT(Ptr<Config> options)
: options_(options) {
auto dimVocabs = options->get<std::vector<int>>("dim-vocabs");
normalize_ = options->get<bool>("normalize");
dimSrcVoc_ = dimVocabs[0];
dimSrcEmb_ = options->get<int>("dim-emb");
dimEncState_ = options->get<int>("dim-rnn");
@ -56,16 +60,16 @@ class Nematus : public ExpressionGraph {
dimBatch_ = options->get<int>("mini-batch");
}
void load(Ptr<ExpressionGraph> graph,
const std::string& name) {
using namespace keywords;
LOG(info) << "Loading model from " << name;
auto numpy = cnpy::npz_load(name);
auto parameters = {
std::vector<std::string> parameters = {
// Source word embeddings
"Wemb",
@ -102,6 +106,20 @@ class Nematus : public ExpressionGraph {
"ff_logit_W", "ff_logit_b",
};
std::vector<std::string> parametersNorm = {
"decoder_att_gamma1", "decoder_att_gamma2",
"decoder_cell1_gamma1", "decoder_cell1_gamma2",
"decoder_cell2_gamma1", "decoder_cell2_gamma2",
"encoder_gamma1", "encoder_gamma2",
"encoder_r_gamma1", "encoder_r_gamma2",
"ff_logit_l1_gamma0", "ff_logit_l1_gamma1",
"ff_logit_l1_gamma2", "ff_state_gamma"
};
if(normalize_)
for(auto& p : parametersNorm)
parameters.push_back(p);
std::map<std::string, std::string> nameMap = {
{"decoder_U", "decoder_cell1_U"},
{"decoder_W", "decoder_cell1_W"},
@ -129,6 +147,9 @@ class Nematus : public ExpressionGraph {
};
for(auto name : parameters) {
UTIL_THROW_IF2(numpy.count(name) == 0,
"Parameter " << name << " does not exist.");
Shape shape;
if(numpy[name].shape.size() == 2) {
shape.set(0, numpy[name].shape[0]);
@ -152,7 +173,7 @@ class Nematus : public ExpressionGraph {
const std::string& name) {
LOG(info) << "Saving model to " << name;
unsigned shape[2];
std::string mode = "w";
@ -274,54 +295,148 @@ class Nematus : public ExpressionGraph {
return std::make_tuple(y, yMask, yIdx);
}
std::tuple<Expr, Expr> encoder(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
using namespace keywords;
auto xEmb = Embedding("Wemb", dimSrcVoc_, dimSrcEmb_)(graph);
Expr x, xMask;
std::tie(x, xMask) = prepareSource(xEmb, batch, 0);
auto xfw = RNN<GRU>(graph, "encoder",
dimSrcEmb_, dimEncState_,
normalize=normalize_,
direction=dir::forward)(x);
auto xbw = RNN<GRU>(graph, "encoder_r",
dimSrcEmb_, dimEncState_,
normalize=normalize_,
direction=dir::backward)(x, mask=xMask);
auto xContext = concatenate({xfw, xbw}, axis=1);
return std::make_tuple(xContext, xMask);
}
std::tuple<Expr, Expr> step(Expr hyps,
const std::vector<size_t> hypIdx = {},
const std::vector<size_t> embIdx = {}) {
using namespace keywords;
auto graph = hyps->graph();
Expr selectedHyps, selectedEmbs;
if(embIdx.empty()) {
selectedHyps = hyps;
selectedEmbs = graph->constant(shape={1, dimTrgEmb_},
init=inits::zeros);
}
else {
// @TODO : solve this better than reshaping!
selectedHyps = reshape(rows(hyps, hypIdx),
{1, hyps->shape()[1], 1, (int)hypIdx.size()});
auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph);
selectedEmbs = reshape(rows(yEmb, embIdx),
{1, yEmb->shape()[1], 1, (int)embIdx.size()});
}
Expr newHyps, logits;
std::tie(newHyps, logits) = step(selectedHyps, selectedEmbs, true);
return std::make_tuple(newHyps, logsoftmax(logits));
}
std::tuple<Expr, Expr> step(Expr yInStates, Expr yEmbeddings,
bool single = false) {
using namespace keywords;
auto yOutStates = (*rnn_)(yEmbeddings, yInStates);
auto yCtx = single ?
rnn_->getCell()->getLastContext() :
rnn_->getCell()->getContexts();
//// 2-layer feedforward network for outputs and cost
auto yLogitsL1 = Dense("ff_logit_l1", dimTrgEmb_,
activation=act::tanh,
normalize=normalize_)
(yEmbeddings, yOutStates, yCtx);
auto yLogitsL2 = Dense("ff_logit_l2", dimTrgVoc_)
(yLogitsL1);
return std::make_tuple(yOutStates, yLogitsL2);
}
Expr startState(Expr context, Expr mask) {
using namespace keywords;
auto meanContext = weighted_average(context, mask, axis=2);
auto start = Dense("ff_state",
dimDecState_,
activation=act::tanh,
normalize=normalize_)(meanContext);
return start;
}
Expr buildEncoder(Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch) {
using namespace keywords;
graph->clear();
rnn_.reset();
setDims(graph, batch);
Expr xContext, xMask;
std::tie(xContext, xMask) = encoder(graph, batch);
auto attention = New<GlobalAttention>("decoder",
xContext, dimDecState_,
mask=xMask, normalize=normalize_);
rnn_ = New<RNN<CGRU>>(graph, "decoder",
dimTrgEmb_, dimDecState_,
attention,
normalize=normalize_);
return startState(xContext, xMask);
}
std::tuple<Expr, Expr, Expr> embeddings(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
using namespace keywords;
auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph);
Expr y, yMask, yIdx;
std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1);
auto yEmpty = graph->zeros(shape={dimBatch_, dimTrgEmb_});
auto yShifted = concatenate({yEmpty, y}, axis=2);
return std::make_tuple(yShifted, yMask, yIdx);
}
Expr build(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
using namespace keywords;
graph->clear();
rnn_.reset();
setDims(graph, batch);
// Embeddings
auto xEmb = Embedding("Wemb", dimSrcVoc_, dimSrcEmb_)(graph);
auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph);
Expr xContext, xMask;
std::tie(xContext, xMask) = encoder(graph, batch);
auto yStartStates = startState(xContext, xMask);
Expr x, xMask;
Expr y, yMask, yIdx;
Expr yEmbeddings, yMask, yIdx;
std::tie(yEmbeddings, yMask, yIdx) = embeddings(graph, batch);
std::tie(x, xMask) = prepareSource(xEmb, batch, 0);
std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1);
auto attention = New<GlobalAttention>("decoder",
xContext, dimDecState_,
mask=xMask, normalize=normalize_);
rnn_ = New<RNN<CGRU>>(graph, "decoder",
dimTrgEmb_, dimDecState_,
attention,
normalize=normalize_);
// Encoder
auto xContext = BiRNN<GRU>("encoder", dimEncState_)
(x, mask=xMask);
Expr yOutStates, yLogits;
std::tie(yOutStates, yLogits) = step(yStartStates, yEmbeddings);
auto xMeanContext = weighted_average(xContext, xMask, axis=2);
auto cost = CrossEntropyCost("cost")(yLogits, yIdx, mask=yMask);
// Decoder
auto yStart = Dense("ff_state",
dimDecState_,
activation=act::tanh)(xMeanContext);
auto yEmpty = graph->zeros(shape={dimBatch_, dimTrgEmb_});
auto yShifted = concatenate({yEmpty, y}, axis=2);
//auto yShifted = shift(y, 1, axis=2);
CGRU cgru({"decoder", xContext, dimDecState_, mask=xMask});
auto yLstm = RNN<CGRU>("decoder", dimDecState_, cgru)
(yShifted, yStart);
auto yCtx = cgru.getContexts();
//// 2-layer feedforward network for outputs and cost
auto ff_logit_l1 = Dense("ff_logit_l1", dimTrgEmb_,
activation=act::tanh)
(yShifted, yLstm, yCtx);
auto ff_logit_l2 = Dense("ff_logit_l2", dimTrgVoc_)
(ff_logit_l1);
auto cost = CrossEntropyCost("cost")
(ff_logit_l2, yIdx, mask=yMask);
return cost;
}
};
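
A note on the refactoring above: the Nematus class now exposes two entry points over the same decoder RNN. build() unrolls the full target sequence for training, while buildEncoder() plus the step() overloads advance decoding one target position at a time, which is what beam search needs. A minimal usage sketch, assuming a graph and batch are already set up (the calls mirror the signatures above; the hypothesis/word index values are made up for illustration):

// Training: one graph over the whole target sequence.
auto nematus = New<Nematus>();
auto cost = nematus->build(graph, batch);
graph->forward();
graph->backward();

// Decoding: encode once, then step through target positions.
auto startState = nematus->buildEncoder(graph, batch);
Expr hyps, logProbs;
std::tie(hyps, logProbs) = nematus->step(startState);              // first step: empty history
std::tie(hyps, logProbs) = nematus->step(hyps, {0, 1}, {42, 7});   // later steps: hypothesis and word indices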

210
src/models/encdec.h Normal file
View File

@ -0,0 +1,210 @@
#pragma once
#include "data/corpus.h"
#include "training/config.h"
#include "graph/expression_graph.h"
#include "layers/rnn.h"
#include "layers/param_initializers.h"
#include "layers/generic.h"
#include "common/logging.h"
namespace marian {
class EncoderBase {
protected:
Ptr<Config> options_;
virtual std::tuple<Expr, Expr>
prepareSource(Expr emb, Ptr<data::CorpusBatch> batch, size_t index) {
using namespace keywords;
std::vector<size_t> indeces;
std::vector<float> mask;
for(auto& word : (*batch)[index]) {
for(auto i: word.first)
indeces.push_back(i);
for(auto m: word.second)
mask.push_back(m);
}
int dimBatch = batch->size();
int dimEmb = emb->shape()[1];
int dimWords = (int)(*batch)[index].size();
auto graph = emb->graph();
auto x = reshape(rows(emb, indeces), {dimBatch, dimEmb, dimWords});
auto xMask = graph->constant(shape={dimBatch, 1, dimWords},
init=inits::from_vector(mask));
return std::make_tuple(x, xMask);
}
public:
EncoderBase(Ptr<Config> options)
: options_(options) {}
virtual std::tuple<Expr, Expr>
build(Ptr<ExpressionGraph>, Ptr<data::CorpusBatch>, size_t = 0) = 0;
};
class DecoderBase {
protected:
Ptr<Config> options_;
virtual std::tuple<Expr, Expr, Expr>
prepareTarget(Expr emb, Ptr<data::CorpusBatch> batch, size_t index) {
using namespace keywords;
std::vector<size_t> indeces;
std::vector<float> mask;
std::vector<float> findeces;
for(int j = 0; j < (*batch)[index].size(); ++j) {
auto& trgWordBatch = (*batch)[index][j];
for(auto i : trgWordBatch.first) {
findeces.push_back((float)i);
if(j < (*batch)[index].size() - 1)
indeces.push_back(i);
}
for(auto m : trgWordBatch.second)
mask.push_back(m);
}
int dimBatch = batch->size();
int dimEmb = emb->shape()[1];
int dimWords = (int)(*batch)[index].size();
auto graph = emb->graph();
auto y = reshape(rows(emb, indeces),
{dimBatch, dimEmb, dimWords - 1});
auto yMask = graph->constant(shape={dimBatch, 1, dimWords},
init=inits::from_vector(mask));
auto yIdx = graph->constant(shape={(int)findeces.size(), 1},
init=inits::from_vector(findeces));
return std::make_tuple(y, yMask, yIdx);
}
public:
DecoderBase(Ptr<Config> options)
: options_(options) {}
virtual std::tuple<Expr, Expr, Expr>
groundTruth(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
using namespace keywords;
int dimBatch = batch->size();
int dimTrgVoc = options_->get<std::vector<int>>("dim-vocabs").back();
int dimTrgEmb = options_->get<int>("dim-emb");
auto yEmb = Embedding("Wemb_dec", dimTrgVoc, dimTrgEmb)(graph);
Expr y, yMask, yIdx;
std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1);
auto yEmpty = graph->zeros(shape={dimBatch, dimTrgEmb});
auto yShifted = concatenate({yEmpty, y}, axis=2);
return std::make_tuple(yShifted, yMask, yIdx);
}
virtual Expr
buildStartState(Expr context, Expr mask) {
using namespace keywords;
auto meanContext = weighted_average(context, mask, axis=2);
bool layerNorm = options_->get<bool>("normalize");
auto start = Dense("ff_state",
options_->get<int>("dim-rnn"),
activation=act::tanh,
normalize=layerNorm)(meanContext);
return start;
}
virtual std::tuple<Expr, std::vector<Expr>>
step(Expr embeddings, std::vector<Expr> states,
Expr context, Expr contextMask, bool single=false) = 0;
};
template <class Encoder, class Decoder>
class Seq2Seq {
protected:
Ptr<Config> options_;
Ptr<EncoderBase> encoder_;
Ptr<DecoderBase> decoder_;
public:
Seq2Seq(Ptr<Config> options)
: options_(options),
encoder_(New<Encoder>(options)),
decoder_(New<Decoder>(options))
{}
virtual void load(Ptr<ExpressionGraph> graph,
const std::string& name) {
graph->load(name);
}
virtual void save(Ptr<ExpressionGraph> graph,
const std::string& name) {
graph->save(name);
}
virtual std::tuple<std::vector<Expr>, Expr, Expr>
buildEncoder(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
using namespace keywords;
graph->clear();
encoder_ = New<Encoder>(options_);
decoder_ = New<Decoder>(options_);
Expr srcContext, srcMask;
std::tie(srcContext, srcMask) = encoder_->build(graph, batch);
auto startState = decoder_->buildStartState(srcContext, srcMask);
size_t decoderLayers = options_->get<size_t>("layers-dec");
std::vector<Expr> startStates(decoderLayers, startState);
return std::make_tuple(startStates, srcContext, srcMask);
}
virtual std::tuple<Expr, std::vector<Expr>>
step(Expr embeddings,
std::vector<Expr> states,
Expr context,
Expr contextMask,
bool single=false) {
return decoder_->step(embeddings, states, context, contextMask, single);
}
virtual Expr build(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
using namespace keywords;
std::vector<Expr> startStates;
Expr srcContext, srcMask;
std::tie(startStates, srcContext, srcMask) = buildEncoder(graph, batch);
Expr trgEmbeddings, trgMask, trgIdx;
std::tie(trgEmbeddings, trgMask, trgIdx) = decoder_->groundTruth(graph, batch);
Expr trgLogits;
std::vector<Expr> trgStates;
std::tie(trgLogits, trgStates) = decoder_->step(trgEmbeddings,
startStates,
srcContext,
srcMask);
auto cost = CrossEntropyCost("cost")(trgLogits, trgIdx,
mask=trgMask);
return cost;
}
};
}
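
A short sketch of how these pieces compose, assuming the EncoderGNMT/DecoderGNMT pair defined in src/models/gnmt.h below (where the GNMT typedef is introduced):

using MyGNMT = Seq2Seq<EncoderGNMT, DecoderGNMT>;   // same shape as the GNMT typedef below
auto model = New<MyGNMT>(options);
auto cost = model->build(graph, batch);             // encoder, start states, ground truth, step, cross-entropy
graph->forward();
graph->backward();

Since Seq2Seq only talks to EncoderBase and DecoderBase, swapping in a different encoder or decoder is a template-argument change rather than a rewrite.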

View File

@ -1,119 +0,0 @@
#pragma once
#include "graph/expression_graph.h"
namespace marian {
/**
* @brief Namespace for code related to managing models in Marian
*/
namespace models {
/**
* @brief Constructs an expression graph representing a feed-forward classifier.
*
* @param dims number of nodes in each layer of the feed-forward classifier
*
* @return a shared pointer to the newly constructed expression graph
*/
void FeedforwardClassifier(ExpressionGraphPtr g,
const std::vector<int>& dims,
size_t batchSize,
bool training = true) {
using namespace keywords;
std::cerr << "Building Multi-layer Feedforward network" << std::endl;
std::cerr << "\tLayer dimensions:";
for(auto d : dims)
std::cerr << " " << d;
std::cerr << std::endl;
boost::timer::cpu_timer timer;
// Construct a shared pointer to an empty expression graph
g->clear();
// Construct an input node called "x" and add it to the expression graph.
//
// For each observed data point, this input will hold a vector of values describing that data point.
// dims.front() specifies the size of this vector
//
// For example, in the MNIST task, for any given image in the training set,
// "x" would hold a vector of pixel values for that image.
//
// Because calculating over one observed data point at a time can be inefficient,
// it is customary to operate over a batch of observed data points at once.
//
// At this point, we do not know the batch size:
// whatevs therefore serves as a placeholder for the batch size, which will be specified later
//
// Once the batch size is known, "x" will represent a matrix with dimensions [batch_size, dims.front()].
// Each row of this matrix will correspond with the observed data vector for one observed data point.
auto x = name(g->input(shape={(int)batchSize, dims.front()}), "x");
// Construct an input node called "y" and add it to the expression graph.
//
// For each observed data point, this input will hold the ground truth label for that data point.
// dims.back() specifies the size of this vector
//
// For example, in the MNIST task, for any given image in the training set,
// "y" might hold one-hot vector representing which digit (0-9) is shown in that image
//
// Because calculating over one observed data point at a time can be inefficient,
// it is customary to operate over a batch of observed data points at once.
//
// At this point, we do not know the batch size:
// whatevs therefore serves as a placeholder for the batch size, which will be specified later
//
// Once the batch size is known, "y" will represent a matrix with dimensions [batch_size, 1].
// Each row of this matrix will correspond with the ground truth data vector for one observed data point.
auto y = name(g->input(shape={(int)batchSize, 1}), "y");
std::vector<Expr> layers, weights, biases;
for(int i = 0; i < dims.size()-1; ++i) {
int in = dims[i];
int out = dims[i+1];
if(i == 0) {
// Create a dropout node as the parent of x,
// and place that dropout node as the value of layers[0]
layers.emplace_back(dropout(x, value=0.2));
} else {
// Multiply the matrix in layers[i-1] by the matrix in weights[i-1]
// Take the result, and perform matrix addition on biases[i-1].
// Wrap the result in rectified linear activation function,
// and finally wrap that in a dropout node
layers.emplace_back(dropout(relu(affine(layers.back(), weights.back(), biases.back())),
value=0.5));
}
// Construct a weight node for the outgoing connections from layer i
weights.emplace_back(
g->param("W" + std::to_string(i), {in, out},
init=inits::uniform()));
// Construct a bias node. By definition, a bias node stores the value 1.
// Therefore, we don't actually store the 1.
// Instead, the bias node object stores the weights on the connections
// that are outgoing from the bias node.
// These weights are initialized to zero
biases.emplace_back(
g->param("b" + std::to_string(i), {1, out},
init=inits::zeros));
}
// Perform matrix multiplication and addition for the last layer
auto last = affine(layers.back(), weights.back(), biases.back());
if(training) {
// Define a top-level node for training
auto cost = name(mean(cross_entropy(last, y), axis=0), "cost");
}
else {
// Define a top-level node for inference
auto scores = name(softmax(last), "scores");
}
std::cerr << "\tTotal time: " << timer.format(5, "%ws") << std::endl;
};
}
}

153
src/models/gnmt.h Normal file
View File

@ -0,0 +1,153 @@
#pragma once
#include "models/encdec.h"
#include "layers/attention.h"
namespace marian {
typedef AttentionCell<GRU, GlobalAttention, GRU> CGRU;
class EncoderGNMT : public EncoderBase {
public:
EncoderGNMT(Ptr<Config> options)
: EncoderBase(options) {}
std::tuple<Expr, Expr>
build(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
size_t batchIdx = 0) {
using namespace keywords;
int dimSrcVoc = options_->get<std::vector<int>>("dim-vocabs")[batchIdx];
int dimSrcEmb = options_->get<int>("dim-emb");
int dimEncState = options_->get<int>("dim-rnn");
bool layerNorm = options_->get<bool>("normalize");
bool skipDepth = options_->get<bool>("skip");
size_t encoderLayers = options_->get<size_t>("layers-enc");
float dropoutRnn = options_->get<float>("dropout-rnn");
auto xEmb = Embedding("Wemb", dimSrcVoc, dimSrcEmb)(graph);
Expr x, xMask;
std::tie(x, xMask) = prepareSource(xEmb, batch, batchIdx);
auto xFw = RNN<GRU>(graph, "encoder_bi",
dimSrcEmb, dimEncState,
normalize=layerNorm,
dropout_prob=dropoutRnn)
(x);
auto xBw = RNN<GRU>(graph, "encoder_bi_r",
dimSrcEmb, dimEncState,
normalize=layerNorm,
direction=dir::backward,
dropout_prob=dropoutRnn)
(x, mask=xMask);
debug(xFw, "xFw");
if(encoderLayers > 1) {
auto xBi = concatenate({xFw, xBw}, axis=1);
Expr xContext;
std::vector<Expr> states;
std::tie(xContext, states)
= MLRNN<GRU>(graph, "encoder", encoderLayers - 1,
2 * dimEncState, dimEncState,
normalize=layerNorm,
skip=skipDepth,
dropout_prob=dropoutRnn)
(xBi);
return std::make_tuple(xContext, xMask);
}
else {
auto xContext = concatenate({xFw, xBw}, axis=1);
return std::make_tuple(xContext, xMask);
}
}
};
class DecoderGNMT : public DecoderBase {
private:
Ptr<GlobalAttention> attention_;
public:
DecoderGNMT(Ptr<Config> options)
: DecoderBase(options) {}
virtual std::tuple<Expr, std::vector<Expr>>
step(Expr embeddings,
std::vector<Expr> states,
Expr context,
Expr contextMask,
bool single) {
using namespace keywords;
int dimTrgVoc = options_->get<std::vector<int>>("dim-vocabs").back();
int dimTrgEmb = options_->get<int>("dim-emb");
int dimDecState = options_->get<int>("dim-rnn");
bool layerNorm = options_->get<bool>("normalize");
bool skipDepth = options_->get<bool>("skip");
size_t decoderLayers = options_->get<size_t>("layers-dec");
float dropoutRnn = options_->get<float>("dropout-rnn");
auto graph = embeddings->graph();
if(!attention_)
attention_ = New<GlobalAttention>("decoder",
context, dimDecState,
mask=contextMask,
normalize=layerNorm);
RNN<CGRU> rnnL1(graph, "decoder",
dimTrgEmb, dimDecState,
attention_,
normalize=layerNorm,
dropout_prob=dropoutRnn);
auto stateL1 = rnnL1(embeddings, states[0]);
auto alignedContext = single ?
rnnL1.getCell()->getLastContext() :
rnnL1.getCell()->getContexts();
std::vector<Expr> statesOut;
statesOut.push_back(stateL1);
Expr outputLn;
if(decoderLayers > 1) {
std::vector<Expr> statesIn;
for(int i = 1; i < states.size(); ++i)
statesIn.push_back(states[i]);
std::vector<Expr> statesLn;
std::tie(outputLn, statesLn) = MLRNN<GRU>(graph, "decoder",
decoderLayers - 1,
dimDecState, dimDecState,
normalize=layerNorm,
dropout_prob=dropoutRnn,
skip=skipDepth,
skip_first=skipDepth)
(stateL1, statesIn);
statesOut.insert(statesOut.end(),
statesLn.begin(), statesLn.end());
}
else {
outputLn = stateL1;
}
//// 2-layer feedforward network for outputs and cost
auto logitsL1 = Dense("ff_logit_l1", dimTrgEmb,
activation=act::tanh,
normalize=layerNorm)
(embeddings, outputLn, alignedContext);
auto logitsL2 = Dense("ff_logit_l2", dimTrgVoc)
(logitsL1);
return std::make_tuple(logitsL2, statesOut);
}
};
typedef Seq2Seq<EncoderGNMT, DecoderGNMT> GNMT;
}
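
The encoder stacks layers-enc - 1 unidirectional GRUs on top of the bidirectional pair, and the decoder stacks layers-dec - 1 GRUs above the attention cell, both through MLRNN with optional skip connections. The sketch below only illustrates what skip=true amounts to across depth; it is not the MLRNN implementation, and gruLayer is a hypothetical per-layer cell:

Expr input = xBi;                     // e.g. the concatenated bidirectional context
for(int i = 0; i < numLayers; ++i) {
  Expr output = gruLayer[i](input);   // each layer has its own parameters (and dropout mask)
  if(skip)
    output = output + input;          // skip connection across depth
  input = output;                     // feeds the next layer
}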

View File

@ -4,20 +4,18 @@
#include <memory>
#include "kernels/tensor_operators.h"
#include "training/config.h"
#include "optimizers/clippers.h"
namespace marian {
// @TODO: modify computation graph to group all parameters in a single matrix object.
// This will allow performing a single large SGD update per batch. Currently there
// are as many updates as there are parameters.
class OptimizerBase {
public:
template <typename ...Args>
OptimizerBase(Args... args)
: clipper_(Get(keywords::clip, nullptr, args...)) {}
OptimizerBase(float eta, Args... args)
: clipper_(Get(keywords::clip, nullptr, args...)),
eta_(eta) {}
float backpropUpdate(Ptr<ExpressionGraph> graph) {
graph->forward();
float cost = graph->topNode()->scalar();
@ -29,43 +27,46 @@ class OptimizerBase {
void update(Ptr<ExpressionGraph> graph) {
Tensor p = graph->params().vals();
Tensor g = graph->params().grads();
update(p, g);
update(p, g);
}
void update(Tensor params, Tensor grads) {
if(clipper_)
clipper_->clip(grads);
updateImpl(params, grads);
}
private:
void updateSchedule() {
eta_ *= 0.5;
LOG(info) << "Changing learning rate to " << eta_;
}
protected:
virtual void updateImpl(Tensor params, Tensor grads) = 0;
Ptr<ClipperBase> clipper_;
float eta_;
};
class Sgd : public OptimizerBase {
public:
template <typename ...Args>
Sgd(float eta=0.01, Args... args)
: OptimizerBase(args...), eta_(eta) {}
Sgd(float eta, Args... args)
: OptimizerBase(eta, args...) {}
private:
void updateImpl(Tensor params, Tensor grads) {
Element(_1 -= eta_ * _2, params, grads);
}
float eta_;
};
// @TODO: Add serialization for historic gradients and parameters
class Adagrad : public OptimizerBase {
public:
template <typename ...Args>
Adagrad(float eta=0.01, Args ...args)
: OptimizerBase(args...),
eta_(eta),
Adagrad(float eta, Args ...args)
: OptimizerBase(eta, args...),
eps_(Get(keywords::eps, 1e-8, args...))
{}
@ -80,7 +81,7 @@ class Adagrad : public OptimizerBase {
alloc_->allocate(gt_, {1, totalSize});
gt_->set(0);
}
Element(_1 += (_2 * _2),
gt_, grads);
@ -88,7 +89,6 @@ class Adagrad : public OptimizerBase {
params, gt_, grads);
}
float eta_;
float eps_;
Ptr<TensorAllocator> alloc_;
Tensor gt_;
@ -100,9 +100,8 @@ class Adagrad : public OptimizerBase {
class Adam : public OptimizerBase {
public:
template <typename ...Args>
Adam(float eta = 0.0001, Args ...args)
: OptimizerBase(args...),
eta_(eta),
Adam(float eta, Args ...args)
: OptimizerBase(eta, args...),
beta1_(Get(keywords::beta1, 0.9, args...)),
beta2_(Get(keywords::beta2, 0.999, args...)),
eps_(Get(keywords::eps, 1e-8, args...)),
@ -110,7 +109,7 @@ class Adam : public OptimizerBase {
{}
void updateImpl(Tensor params, Tensor grads) {
if(!mtAlloc_)
mtAlloc_ = New<TensorAllocator>(params->getDevice());
if(!vtAlloc_)
@ -128,9 +127,9 @@ class Adam : public OptimizerBase {
}
t_++;
float denom1 = 1 - pow(beta1_, t_);
float denom2 = 1 - pow(beta2_, t_);
float denom1 = 1 - std::pow(beta1_, t_);
float denom2 = 1 - std::pow(beta2_, t_);
Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2),
mt_, grads);
Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)),
@ -141,7 +140,6 @@ class Adam : public OptimizerBase {
}
private:
float eta_;
float beta1_;
float beta2_;
float eps_;
@ -158,4 +156,29 @@ Ptr<OptimizerBase> Optimizer(Args&& ...args) {
return Ptr<OptimizerBase>(new Algorithm(args...));
}
Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
Ptr<ClipperBase> clipper = nullptr;
float clipNorm = options->get<double>("clip-norm");
if(clipNorm > 0)
clipper = Clipper<Norm>(clipNorm);
float lrate = options->get<double>("learn-rate");
std::string opt = options->get<std::string>("optimizer");
if(opt == "sgd") {
return Optimizer<Sgd>(lrate, keywords::clip=clipper);
}
else if(opt == "adagrad") {
return Optimizer<Adagrad>(lrate, keywords::clip=clipper);
}
else if(opt == "adam") {
return Optimizer<Adam>(lrate, keywords::clip=clipper);
}
else {
UTIL_THROW2("Unknown optimizer: " << opt);
}
}
}
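
For reference, the denom1/denom2 factors in Adam::updateImpl are the bias-correction terms of standard Adam; the element-wise kernels above implement the moment updates, and the final parameter step (outside the visible hunk) has the usual form

\[
m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad
v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2,
\]
\[
\hat m_t = \frac{m_t}{1-\beta_1^t}, \qquad
\hat v_t = \frac{v_t}{1-\beta_2^t}, \qquad
\theta_t = \theta_{t-1} - \eta\,\frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon}.
\]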

View File

@ -21,7 +21,6 @@
#include <cuda.h>
#include <cudnn.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
@ -87,7 +86,7 @@ void TensorBase::set(const std::vector<float> &v) {
void TensorBase::copyFrom(Tensor in) {
cudaSetDevice(device_);
CUDA_CHECK(cudaMemcpy(data_, in->data(), in->size() * sizeof(float),
CUDA_CHECK(cudaMemcpy(data_ , in->data() , in->size() * sizeof(float),
cudaMemcpyDefault));
cudaStreamSynchronize(0);
}
@ -100,7 +99,8 @@ std::string TensorBase::debug() {
for(int i = 1; i < shape_.size(); ++i)
strm << "x" << shape_[i];
strm << " size=" << shape_.elements()
<< " (" << shape_.elements() * sizeof(float) << "B)" << std::endl;
<< " (" << shape_.elements() * sizeof(float) << "B)";
strm << " device=" << device_ << std::endl;
// values
size_t totSize = shape_.elements();
@ -109,81 +109,90 @@ std::string TensorBase::debug() {
strm << std::fixed << std::setprecision(8) << std::setfill(' ');
for(size_t k = 0; k < shape()[2]; ++k) {
strm << "[ ";
if(shape()[0] > 10) {
for (size_t i = 0; i < shape()[0] && i < 3; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < 3; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2) ] << " ";
}
if(shape()[1] > 3)
strm << "... ";
for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2) ] << " ";
}
}
strm << std::endl << " ...";
for (size_t i = shape()[0] - 3; i < shape()[0]; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < 3; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2) ] << " ";
}
if(shape()[1] > 3)
strm << "... ";
for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2) ] << " ";
}
}
}
else {
for (size_t i = 0; i < shape()[0] && i < 10; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < 3; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2) ] << " ";
}
if(shape()[1] > 3)
strm << "... ";
for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2) ] << " ";
}
}
}
strm << "]" << std::endl;
for(size_t l = 0; l < shape()[3]; ++l) {
for(size_t k = 0; k < shape()[2]; ++k) {
strm << "[ ";
if(shape()[0] > 10) {
for (size_t i = 0; i < shape()[0] && i < 3; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < 3; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > 3)
strm << "... ";
for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
}
strm << std::endl << " ...";
for (size_t i = shape()[0] - 3; i < shape()[0]; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < 3; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > 3)
strm << "... ";
for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
}
}
else {
for (size_t i = 0; i < shape()[0] && i < 10; ++i) {
if(i > 0)
strm << std::endl << " ";
for (size_t j = 0; j < shape()[1] && j < 3; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
if(shape()[1] > 3)
strm << "... ";
for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) {
strm << std::setw(12)
<< values[ i * shape().stride(0)
+ j * shape().stride(1)
+ k * shape().stride(2)
+ l * shape().stride(3) ] << " ";
}
}
}
strm << "]" << std::endl;
}
}
return strm.str();
}
DeviceGPU::~DeviceGPU() {
cudaSetDevice(device_);
if(data_)
CUDA_CHECK(cudaFree(data_));
cudaSetDevice(device_);
if(data_)
CUDA_CHECK(cudaFree(data_));
cudaDeviceSynchronize();
}
void DeviceGPU::reserve(size_t size) {
cudaSetDevice(device_);
UTIL_THROW_IF2(size < size_, "New size must be larger than old size");
if(data_) {

View File

@ -25,9 +25,6 @@
#include <iostream>
#include <sstream>
#include <iomanip>
#ifdef CUDNN
#include <cudnn.h>
#endif
#include "3rd_party/exception.h"
#include "common/definitions.h"
@ -40,30 +37,14 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
float* data_;
Shape shape_;
size_t device_;
#ifdef CUDNN
cudnnTensorDescriptor_t cudnnDesc_;
#endif
public:
TensorBase(float* data, Shape shape, size_t device)
: data_(data), shape_(shape), device_(device)
{
#ifdef CUDNN
cudnnCreateTensorDescriptor(&cudnnDesc_);
cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT,
shape_[0], shape_[1],
shape_[2], shape_[3],
shape_.stride(0), shape_.stride(1),
shape_.stride(2), shape_.stride(3));
#endif
}
{}
~TensorBase()
{
#ifdef CUDNN
cudnnDestroyTensorDescriptor(cudnnDesc_);
#endif
}
{}
virtual void reset(float* data) {
data_ = data;
@ -90,6 +71,10 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
return device_;
}
Tensor subtensor(int offset, int size){
return Tensor(new TensorBase(data_ + offset, {1, size}, device_ ));
}
float get(size_t i);
void set(size_t i, float value);
@ -102,12 +87,6 @@ class TensorBase : public std::enable_shared_from_this<TensorBase> {
void copyFrom(Tensor);
#ifdef CUDNN
cudnnTensorDescriptor_t& cudnn() {
return cudnnDesc_;
}
#endif
std::string debug();
};
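
The new subtensor() is a non-owning view into the same device buffer: it wraps data_ + offset without copying, which is what the per-device parameter sharding in training/graph_group.h relies on. A minimal sketch (sizes are illustrative; the view must not outlive the allocation that owns the buffer):

Tensor params;                             // assume a {1, 1000} tensor already allocated
auto shard = params->subtensor(250, 500);  // view over elements [250, 750), no copy
shard->set(0.f);                           // writes through to the underlying buffer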

View File

@ -94,9 +94,14 @@ class TensorAllocator {
gaps_.insert(lastGap_);
}
~TensorAllocator() {
clear();
}
void reserve(size_t elements = 0) {
float mult = elements / FLOATS + 1;
std::cerr << "Extending reserved space to " << mult * CHUNK << " MB" << std::endl;
LOG(memory) << "Extending reserved space to "
<< mult * CHUNK << " MB (device " << device_.getDevice() << ")";
size_t old = device_.capacity();
float* oldStart = device_.data();
@ -106,8 +111,8 @@ class TensorAllocator {
void reserveExact(size_t elements = 0) {
size_t mbytes = (elements * sizeof(float)) / MBYTE;
std::cerr << "Reserving space for " << elements
<< " floats (" << mbytes << " MB)" << std::endl;
LOG(memory) << "Reserving space for " << elements
<< " floats (" << mbytes << " MB, device " << device_.getDevice() << ")";
size_t old = device_.capacity();
float* oldStart = device_.data();

99
src/test/bn_test.cu Normal file
View File

@ -0,0 +1,99 @@
#include <iostream>
#include <cuda.h>
#include <string>
#include <vector>
#include <cmath>
#include <random>
#include <algorithm>
#include <iterator>
#include <iostream>
#include <functional>
#include "layers/generic.h"
#include "marian.h"
int main(int argc, char** argv) {
using namespace marian;
using namespace data;
using namespace keywords;
auto options = New<Config>(argc, argv, false);
int batchSize = 128;
std::vector<float> temp(batchSize * 3072);
std::vector<float> temp2(3072 * 3072);
std::vector<float> indeces(batchSize, 0.f);
std::random_device rnd_device;
// Specify the engine and distribution.
std::mt19937 mersenne_engine(rnd_device());
mersenne_engine.seed(1234);
std::uniform_real_distribution<float> dist(-1.f, 1.f);
auto gen = std::bind(dist, mersenne_engine);
std::generate(std::begin(temp), std::end(temp), gen);
std::generate(std::begin(temp2), std::end(temp2), gen);
{
auto graph = New<ExpressionGraph>();
graph->setDevice(0);
graph->reserveWorkspaceMB(128);
auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp));
auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0));
auto beta = graph->param("beta", {1, 3072}, init=inits::zeros);
auto y = layer_norm(x, gamma, beta);
auto yLogitsL1 = Dense("ff_logit_l1", 512,
activation=act::tanh,
normalize=true)
(y, y, y);
auto yLogitsL2 = Dense("ff_logit_l2", 50000)
(yLogitsL1);
auto idx = graph->constant(shape={(int)indeces.size(), 1},
init=inits::from_vector(indeces));
auto ce = cross_entropy(yLogitsL2, idx);
auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0);
debug(x, "x");
debug(gamma, "gamma");
debug(beta, "beta");
graph->forward();
graph->backward();
}
/*{
auto graph = New<ExpressionGraph>();
graph->setDevice(0);
graph->reserveWorkspaceMB(128);
auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp));
auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0));
auto beta = graph->param("beta", {1, 3072}, init=inits::zeros);
auto y = layer_norm(x, gamma, beta);
auto w = graph->param("w", {3072, 3072}, init=inits::from_vector(temp2));
auto y2 = tanh(layer_norm(dot(y, w), gamma, beta));
auto idx = graph->constant(shape={(int)indeces.size(), 1},
init=inits::from_vector(indeces));
auto ce = cross_entropy(y2, idx);
auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0);
debug(x, "x");
debug(gamma, "gamma");
debug(beta, "beta");
graph->forward();
graph->backward();
}*/
return 0;
}
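
For reference, the layer_norm(x, gamma, beta) call exercised above computes, per row x with mean \mu and variance \sigma^2 taken over the hidden dimension,

\[
\mathrm{LN}(x) = \gamma \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta,
\]

so the test's gamma = 2 and beta = 0 simply rescale the normalized activations.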

View File

@ -6,68 +6,26 @@
#include <boost/timer/timer.hpp>
#include <boost/chrono.hpp>
#include "tensors/tensor_allocator.h"
#include "tensors/tensor.h"
#include "kernels/tensor_operators.h"
#include "layers/dropout.h"
#include "kernels/dropout_cudnn.h"
#include "training/config.h"
#include "marian.h"
#include "layers/param_initializers.h"
using namespace marian;
using namespace keywords;
int main() {
int cudaDevice = 0;
TensorAllocator* params = new TensorAllocator(cudaDevice);
int main(int argc, char** argv) {
auto c = New<Config>(argc, argv);
cublasHandle_t handle = create_handle(cudaDevice);
int rows = 64;
int cols = 2048;
int layers = 64;
std::cerr << "Number of elements in tensor: " << rows * cols * layers << std::endl;
int rep = 1000;
const float prob = 0.5f;
Tensor dropoutMatrix;
params->allocate(dropoutMatrix, {rows, cols, layers});
DropoutGenerator dropout(0);
cudaStreamSynchronize(0);
boost::timer::cpu_timer timer;
for (int i = 0; i < rep;++i) {
dropout.Generate(dropoutMatrix, prob);
auto g = New<ExpressionGraph>();
g->setDevice(0);
g->reserveWorkspaceMB(512);
for(int i = 0; i < 10; ++i) {
g->clear();
auto mask = g->dropout(0.2, {10, 3072});
debug(mask, "mask");
g->forward();
}
cudaDeviceSynchronize();
std::cerr << "DropoutGenerator: " << rep << " repetitions: " << timer.format(5, "%ws") << std::endl;
Tensor cudnnInTensor, cudnnOutTensor;
params->allocate(cudnnInTensor, {rows, cols, layers});
params->allocate(cudnnOutTensor, {rows, cols, layers});
void* states_;
void* space_;
size_t spaceSize_;
cudnnDropoutDescriptor_t dropDesc_;
CudnnDropoutPrepare(cudnnInTensor, prob, &dropDesc_, &space_, &spaceSize_, &states_, (size_t)1234);
cudaStreamSynchronize(0);
cudaDeviceSynchronize();
timer.start();
for (int i = 0; i < rep; ++i) {
CudnnDropoutForward(dropDesc_, space_, spaceSize_, cudnnInTensor, cudnnOutTensor);
}
cudaDeviceSynchronize();
std::cerr << "CUDNN Dropout: " << rep << " repetitions: " << timer.format(5, "%ws") << std::endl;
return 0;
}
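
The rewritten test samples a dropout mask directly on the expression graph instead of going through cuDNN. In the scheme named in the commit title, one mask per sequence is reused across time steps and layers, and the mask is scaled at sampling time (inverted dropout) so expectations are preserved and no rescaling is needed at inference. A minimal sketch; g->dropout is the call used above, while p, dimBatch, dimHidden, and rnnStep are hypothetical illustration names:

// mask[i] = bernoulli(1 - p) / (1 - p), so E[mask[i]] = 1
auto mask = g->dropout(p, {dimBatch, dimHidden});   // sampled once per sequence
// auto h = rnnStep(x_t * mask, hPrev);             // reused at every time step (illustrative)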

View File

@ -7,38 +7,42 @@
#include <boost/chrono.hpp>
#include "marian.h"
#include "training/config.h"
#include "optimizers/optimizers.h"
#include "optimizers/clippers.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "models/nematus.h"
#include "models/gnmt.h"
int main(int argc, char** argv) {
using namespace marian;
using namespace data;
auto options = New<Config>(argc, argv, false);
std::vector<std::string> files =
{"../test/mini.de",
"../test/mini.en"};
{"../testln/mini.en",
"../testln/mini.de"};
std::vector<std::string> vocab =
{"../test/vocab.de.json",
"../test/vocab.en.json"};
{"../benchmark/marian32K/train.tok.true.bpe.en.json",
"../benchmark/marian32K/train.tok.true.bpe.de.json"};
std::vector<int> maxVocab = { 50000, 50000 };
YAML::Node& c = options->get();
c["train-sets"] = files;
c["vocabs"] = vocab;
auto corpus = DataSet<Corpus>(files, vocab, maxVocab, 50);
BatchGenerator<Corpus> bg(corpus, 10, 20);
auto corpus = DataSet<Corpus>(options);
BatchGenerator<Corpus> bg(corpus, options);
auto graph = New<ExpressionGraph>();
graph->setDevice(std::atoi(argv[1]));
graph->setDevice(1);
auto nematus = New<Nematus>();
nematus->load(graph, "../test/model.npz");
auto encdec = New<GNMT>(options);
encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz");
graph->reserveWorkspaceMB(128);
float sum = 0;
boost::timer::cpu_timer timer;
size_t batches = 1;
for(int i = 0; i < 1; ++i) {
@ -47,39 +51,15 @@ int main(int argc, char** argv) {
auto batch = bg.next();
batch->debug();
auto costNode = nematus->build(graph, batch);
for(auto p : graph->params())
debug(p, p->name());
auto costNode = encdec->build(graph, batch);
//for(auto p : graph->params())
//debug(p, p->name());
debug(costNode, "cost");
graph->graphviz("debug.dot");
//graph->graphviz("debug.dot");
graph->forward();
graph->backward();
float cost = costNode->val()->scalar();
sum += cost;
if(batches % 100 == 0) {
std::cout << std::setfill(' ')
<< "Epoch " << i
<< " Update " << batches
<< " Cost " << std::setw(7) << std::setprecision(6) << cost
<< " UD " << timer.format(2, "%ws");
float seconds = std::stof(timer.format(5, "%w"));
float sentences = 100 * batch->size() / seconds;
std::cout << " " << std::setw(5)
<< std::setprecision(4)
<< sentences
<< " sentences/s" << std::endl;
timer.start();
}
if(batches % 10000 == 0)
nematus->save(graph, "../test/model.marian." + std::to_string(batches) + ".npz");
//graph->backward();
batches++;
}

View File

@ -0,0 +1,247 @@
#include <algorithm>
#include <chrono>
#include <iomanip>
#include <string>
#include <cstdio>
#include <boost/timer/timer.hpp>
#include <boost/chrono.hpp>
#include "marian.h"
#include "training/config.h"
#include "optimizers/optimizers.h"
#include "optimizers/clippers.h"
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "models/gnmt.h"
#include "translator/nth_element.h"
#include "common/history.h"
namespace marian {
template <class Builder>
class BeamSearch {
private:
Ptr<Builder> builder_;
size_t beamSize_;
cudaStream_t stream_{0};
public:
BeamSearch(Ptr<Builder> builder)
: builder_(builder),
beamSize_(12)
{}
Beam toHyps(const std::vector<uint> keys,
const std::vector<float> costs,
size_t vocabSize,
const Beam& beam) {
Beam newBeam;
for(int i = 0; i < keys.size(); ++i) {
int embIdx = keys[i] % vocabSize;
int hypIdx = keys[i] / vocabSize;
float cost = costs[i];
newBeam.push_back(
New<Hypothesis>(beam[hypIdx], embIdx, hypIdx, cost));
}
return newBeam;
}
Beam pruneBeam(const Beam& beam) {
Beam newBeam;
for(auto hyp : beam) {
if(hyp->GetWord() > 0) {
newBeam.push_back(hyp);
}
}
return newBeam;
}
std::tuple<std::vector<Expr>, Expr>
step(std::vector<Expr> hyps,
Expr srcContext,
Expr srcMask,
const std::vector<size_t> hypIdx = {},
const std::vector<size_t> embIdx = {}) {
using namespace keywords;
auto graph = hyps[0]->graph();
// @TODO: not hard-coded!
int dimTrgEmb_ = 512;
int dimTrgVoc_ = 50000;
std::vector<Expr> selectedHyps;
Expr selectedEmbs;
if(embIdx.empty()) {
selectedHyps = hyps;
selectedEmbs = graph->constant(shape={1, dimTrgEmb_},
init=inits::zeros);
}
else {
// @TODO : solve this better than reshaping!
for(auto h : hyps)
selectedHyps.push_back(
reshape(rows(h, hypIdx), {1, h->shape()[1], 1, (int)hypIdx.size()}));
auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph);
selectedEmbs = reshape(rows(yEmb, embIdx),
{1, yEmb->shape()[1], 1, (int)embIdx.size()});
}
Expr logits;
std::vector<Expr> newHyps;
std::tie(logits, newHyps) = builder_->step(selectedEmbs,
selectedHyps,
srcContext,
srcMask,
true);
return std::make_tuple(newHyps, logsoftmax(logits));
}
std::tuple<std::vector<Expr>, Expr>
step(std::vector<Expr> hyps,
Expr srcContext,
Expr srcMask,
const Beam& beam) {
std::vector<size_t> hypIndeces;
std::vector<size_t> embIndeces;
std::vector<float> beamCosts;
for(auto hyp : beam) {
hypIndeces.push_back(hyp->GetPrevStateIndex());
embIndeces.push_back(hyp->GetWord());
beamCosts.push_back(hyp->GetCost());
}
auto graph = hyps[0]->graph();
auto costs = graph->constant(keywords::shape={1, 1, 1, (int)beamCosts.size()},
keywords::init=inits::from_vector(beamCosts));
std::vector<Expr> newHyps;
Expr probs;
std::tie(newHyps, probs) = step(hyps,
srcContext,
srcMask,
hypIndeces,
embIndeces);
probs = probs + costs;
return std::make_tuple(newHyps, probs);
}
Ptr<History> search(Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch) {
std::vector<Expr> startStates;
Expr srcContext, srcMask;
std::tie(startStates, srcContext, srcMask)
= builder_->buildEncoder(graph, batch);
size_t pos = 0;
auto history = New<History>(0);
Beam beam(1, New<Hypothesis>());
bool first = true;
bool final = false;
std::vector<size_t> beamSizes(1, beamSize_);
auto nth = New<NthElement>(beamSize_, batch->size(), stream_);
history->Add(beam);
std::vector<Expr> hyps;
Expr probs;
do {
if(first) {
std::tie(hyps, probs) = step(startStates,
srcContext,
srcMask);
pos = graph->forward();
}
else {
std::tie(hyps, probs) = step(hyps,
srcContext,
srcMask,
beam);
beamSizes[0] = beam.size();
pos = graph->forward(pos);
}
size_t dimTrgVoc = probs->shape()[1];
std::vector<unsigned> outKeys;
std::vector<float> outCosts;
for(int i = 0; i < probs->shape()[3]; i++) {
probs->val()->set(i * dimTrgVoc + 1, std::numeric_limits<float>::lowest());
}
nth->getNBestList(beamSizes, probs->val(),
outCosts, outKeys, first);
first = false;
beam = toHyps(outKeys, outCosts, dimTrgVoc, beam);
final = history->size() >= 3 * batch->words();
history->Add(beam, final);
beam = pruneBeam(beam);
} while(!beam.empty() && !final);
return history;
}
};
}
int main(int argc, char** argv) {
using namespace marian;
using namespace data;
auto options = New<Config>(argc, argv, false);
std::vector<std::string> files =
{"../benchmark/marian32K/newstest2016.tok.true.bpe.en"};
//{"../benchmark/marian32K/test.txt"};
std::vector<std::string> vocab =
{"../benchmark/marian32K/train.tok.true.bpe.en.json"};
YAML::Node& c = options->get();
c["train-sets"] = files;
c["vocabs"] = vocab;
auto corpus = DataSet<Corpus>(options);
BatchGenerator<Corpus> bg(corpus, options);
auto graph = New<ExpressionGraph>();
graph->setDevice(1);
auto target = New<Vocab>();
target->load("../benchmark/marian32K/train.tok.true.bpe.de.json", 50000);
auto encdec = New<GNMT>(options);
encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz");
graph->reserveWorkspaceMB(128);
boost::timer::cpu_timer timer;
bg.prepare(false);
while(bg) {
auto batch = bg.next();
auto search = New<BeamSearch<GNMT>>(encdec);
auto history = search->search(graph, batch);
auto results = history->NBest(1);
for(auto r : results) {
for(auto w : r.first)
if(w != 0)
std::cout << (*target)[w] << " ";
//std::cout << r.second->GetCost() << std::endl;
std::cout << std::endl;
}
}
std::cerr << timer.format(5, "%ws") << std::endl;
return 0;
}
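
Two details of the search loop above are easy to miss. The beam-aware step() adds the running hypothesis costs to the log-softmax, so every candidate score is an accumulated log-probability: extending hypothesis h with word w gives

\[
s(h \cdot w) = s(h) + \log p(w \mid h, x).
\]

And toHyps() unflattens the top-k keys with embIdx = key % vocabSize and hypIdx = key / vocabSize; for example, with vocabSize = 50000 a key of 100003 selects hypothesis 2 extended by word 3.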

View File

@ -1,123 +1,86 @@
#include <iostream>
#include <boost/timer/timer.hpp>
//#include "tensors/tensor_allocator.h"
//#include "tensors/tensor_gpu.h"
//#include "kernels/tensor_operators.h"
//#include "kernels/thrust_functions.h"
#include "data/corpus.h"
#include "data/batch_generator.h"
#include "tensors/tensor_allocator.h"
#include "tensors/tensor.h"
#include "kernels/tensor_operators.h"
#include "kernels/thrust_functions.h"
#include "common/logging.h"
using namespace marian;
int main() {
Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v")};
std::vector<std::string> files =
{"../benchmark/train.tok.true.en",
"../benchmark/train.tok.true.en",
"../benchmark/train.tok.true.de"};
Ptr<TensorAllocator> params = New<TensorAllocator>(0);
std::vector<std::string> vocab =
{"../benchmark/train.tok.true.en.json",
"../benchmark/train.tok.true.en.json",
"../benchmark/train.tok.true.de.json"};
cublasHandle_t handle = create_handle(0);
std::vector<int> maxVocab = { 50000, 50000, 50000 };
int words = 64;
int batch = 128;
int hidden = 4096;
using namespace data;
auto corpus = New<Corpus>(files, vocab, maxVocab, 50);
BatchGenerator<Corpus> bg(corpus, 64, 20);
Tensor mappedState;
params->allocate(mappedState, {batch, hidden, 1});
mappedState->set(0.001);
bg.prepare();
Tensor mappedContext;
params->allocate(mappedContext, {batch, hidden, words});
mappedContext->set(0.001);
Tensor va;
params->allocate(va, {hidden, 1});
va->set(0.001);
Tensor out1;
params->allocate(out1, {batch, hidden, words});
out1->set(0);
Tensor gMappedState;
params->allocate(gMappedState, {batch, hidden, 1});
gMappedState->set(0);
Tensor gMappedContext;
params->allocate(gMappedContext, {batch, hidden, words});
gMappedContext->set(0.001);
Tensor gVa;
params->allocate(gVa, {hidden, 1});
va->set(0.001);
Tensor gOut1;
params->allocate(gOut1, {batch, hidden, words});
out1->set(0);
Tensor out2;
params->allocate(out2, {batch, 1, words});
out2->set(0);
boost::timer::cpu_timer timer;
for(int i = 0; i < 5000; ++i) {
Element(_1 = Tanh(_2 + _3), out1, mappedState, mappedContext);
Prod(handle, out2, out1, va, false, false, 0);
Prod(handle, gOut1, out2, va, false, true, 1.0f);
Prod(handle, gVa, out1, out2, true, false, 1.0f);
Add(_1 * (1.f - (_2 *_2)), gMappedState, out1, out1);
Add(_1 * (1.f - (_2 *_2)), gMappedContext, out1, out1);
cudaStreamSynchronize(0);
size_t i = 0;
size_t samples = 0;
while(bg) {
auto batch = bg.next();
if(i && i % 10000 == 0)
std::cerr << "[" << i << "/" << samples << "]" << std::endl;
if(i % 100 == 0)
std::cerr << ".";
i++;
samples += batch->size();
std::cout << "." << std::flush;
}
std::cout << timer.format(5, "%ws") << std::endl;
//TensorAllocator params = newTensorAllocator<DeviceGPU>();
//
//cublasHandle_t handle = create_handle();
//
//int words = 64;
//int batch = 128;
//int hidden = 4096;
//
//Tensor mappedState;
//params->allocate(mappedState, {batch, hidden, 1});
//mappedState->set(0.001);
//
//Tensor mappedContext;
//params->allocate(mappedContext, {batch, hidden, words});
//mappedContext->set(0.001);
//
//Tensor va;
//params->allocate(va, {hidden, 1});
//va->set(0.001);
//
//Tensor out1;
//params->allocate(out1, {batch, hidden, words});
//out1->set(0);
//
//Tensor gMappedState;
//params->allocate(gMappedState, {batch, hidden, 1});
//gMappedState->set(0);
//
//Tensor gMappedContext;
//params->allocate(gMappedContext, {batch, hidden, words});
//gMappedContext->set(0.001);
//
//Tensor gVa;
//params->allocate(gVa, {hidden, 1});
//va->set(0.001);
//
//Tensor gOut1;
//params->allocate(gOut1, {batch, hidden, words});
//out1->set(0);
//
//Tensor out2;
//params->allocate(out2, {batch, 1, words});
//out2->set(0);
//
//boost::timer::cpu_timer timer;
//for(int i = 0; i < 5000; ++i) {
// Element(_1 = Tanh(_2 + _3), out1, mappedState, mappedContext);
// Prod(handle, out2, out1, va, false, false, 0);
// Prod(handle, gOut1, out2, va, false, true, 1.0f);
// Prod(handle, gVa, out1, out2, true, false, 1.0f);
// Add(_1 * (1.f - (_2 *_2)), gMappedState, out1, out1);
// Add(_1 * (1.f - (_2 *_2)), gMappedContext, out1, out1);
// cudaStreamSynchronize(0);
//
// if(i % 100 == 0)
// std::cout << "." << std::flush;
//}
//std::cout << timer.format(5, "%ws") << std::endl;
//
//boost::timer::cpu_timer timer2;
//for(int i = 0; i < 5000; ++i) {
// Att(out2, mappedContext, mappedState, va);
// AttBack(gMappedContext, gMappedState, gVa,
// mappedContext, mappedState, va, out2);
// cudaStreamSynchronize(0);
// if(i % 100 == 0)
// std::cout << "." << std::flush;
//}
//std::cout << timer2.format(5, "%ws") << std::endl;
boost::timer::cpu_timer timer2;
for(int i = 0; i < 5000; ++i) {
Att(va, out2, mappedContext, mappedState, nullptr);
AttBack(gVa, gMappedContext, gMappedState, nullptr,
va, mappedContext, mappedState, out2, nullptr);
cudaStreamSynchronize(0);
if(i % 100 == 0)
std::cout << "." << std::flush;
}
std::cout << timer2.format(5, "%ws") << std::endl;
return 0;
}

View File

@ -1,8 +1,10 @@
#include "command/config.h"
#include <set>
#include <string>
#include <boost/algorithm/string.hpp>
#include "training/config.h"
#include "common/file_stream.h"
#include "common/logging.h"
#define SET_OPTION(key, type) \
do { if(!vm_[key].defaulted() || !config_[key]) { \
@ -14,6 +16,8 @@ do { if(vm_.count(key) > 0) { \
config_[key] = vm_[key].as<type>(); \
}} while(0)
namespace marian {
bool Config::has(const std::string& key) const {
return config_[key];
}
@ -26,6 +30,10 @@ const YAML::Node& Config::get() const {
return config_;
}
YAML::Node& Config::get() {
return config_;
}
void ProcessPaths(YAML::Node& node, const boost::filesystem::path& configPath, bool isPath) {
using namespace boost::filesystem;
std::set<std::string> paths = {"model", "trainsets", "vocabs"};
@ -68,24 +76,18 @@ void ProcessPaths(YAML::Node& node, const boost::filesystem::path& configPath, b
}
void Config::validate() const {
if (has("trainsets")) {
std::vector<std::string> tmp = get<std::vector<std::string>>("trainsets");
if (tmp.size() != 2) {
std::cerr << "No trainsets!" << std::endl;
exit(1);
}
} else {
std::cerr << "No trainsets!" << std::endl;
exit(1);
UTIL_THROW_IF2(!has("train-sets")
|| get<std::vector<std::string>>("train-sets").empty(),
"No train sets given in config file or on command line");
if(has("vocabs")) {
UTIL_THROW_IF2(get<std::vector<std::string>>("vocabs").size() !=
get<std::vector<std::string>>("train-sets").size(),
"There should be as many vocabularies as training sets");
}
if (has("vocabs")) {
if (get<std::vector<std::string>>("vocabs").size() != 2) {
std::cerr << "No vocab files!" << std::endl;
exit(1);
}
} else {
std::cerr << "No vocab files!" << std::endl;
exit(1);
if(has("valid-sets")) {
UTIL_THROW_IF2(get<std::vector<std::string>>("valid-sets").size() !=
get<std::vector<std::string>>("train-sets").size(),
"There should be as many validation sets as training sets");
}
}
@ -122,7 +124,7 @@ void OutputRec(const YAML::Node node, YAML::Emitter& out) {
}
}
void Config::addOptions(int argc, char** argv) {
void Config::addOptions(int argc, char** argv, bool doValidate) {
std::string configPath;
namespace po = boost::program_options;
@ -133,50 +135,88 @@ void Config::addOptions(int argc, char** argv) {
"Configuration file")
("model,m", po::value<std::string>()->default_value("./model"),
"Path prefix for model to be saved")
("device,d", po::value<std::vector<int>>()
->multitoken()
->default_value(std::vector<int>({0}), "0"),
"Use device(s) no. arg")
("init,i", po::value<std::string>(),
"Load weights from arg before training")
("overwrite", po::value<bool>()->default_value(false),
"Overwrite model with following checkpoints")
("trainsets,t", po::value<std::vector<std::string>>()->multitoken(),
("train-sets,t", po::value<std::vector<std::string>>()->multitoken(),
"Paths to training corpora: source target")
("vocabs,v", po::value<std::vector<std::string>>()->multitoken(),
"Paths to vocabulary files, have to correspond to --trainsets")
"Paths to vocabulary files have to correspond to --trainsets. "
"If this parameter is not supplied we look for vocabulary files "
"source.{yml,json} and target.{yml,json}. "
"If these files do not exists they are created.")
("max-length", po::value<size_t>()->default_value(50),
"Maximum length of a sentence in a training sentence pair")
("after-epochs,e", po::value<size_t>()->default_value(0),
"Finish after this many epochs, 0 is infinity")
("after-batches", po::value<size_t>()->default_value(0),
"Finish after this many batch updates, 0 is infinity")
("disp-freq", po::value<size_t>()->default_value(100),
("disp-freq", po::value<size_t>()->default_value(1000),
"Display information every arg updates")
("save-freq", po::value<size_t>()->default_value(30000),
("save-freq", po::value<size_t>()->default_value(10000),
"Save model file every arg updates")
("no-shuffle", po::value<bool>()->zero_tokens()->default_value(false),
"Skip shuffling of training data before each epoch")
("workspace,w", po::value<size_t>()->default_value(2048),
"Preallocate arg MB of work space")
("log", po::value<std::string>(),
"Log training process information to file given by arg")
;
po::options_description hyper("Search options");
hyper.add_options()
("max-length", po::value<size_t>()->default_value(50),
"Maximum length of a sentence in a training sentence pair")
("mini-batch,b", po::value<int>()->default_value(40),
"Size of mini-batch used during update")
("maxi-batch", po::value<int>()->default_value(20),
"Number of batches to preload for length-based sorting")
("lrate,l", po::value<double>()->default_value(0.0002),
"Learning rate for Adam algorithm")
("clip-norm", po::value<double>()->default_value(1.f),
"Clip gradient norm to arg (0 to disable)")
po::options_description valid("Validation set options");
valid.add_options()
("valid-sets", po::value<std::vector<std::string>>()->multitoken(),
"Paths to validation corpora: source target")
("valid-freq", po::value<size_t>()->default_value(10000),
"Validate model every arg updates")
("valid-metrics", po::value<std::vector<std::string>>()
->multitoken()
->default_value(std::vector<std::string>({"cross-entropy"}),
"cross-entropy"),
"Metric to use during validation: cross-entropy, perplexity. "
"Multiple metrics can be specified")
("early-stopping", po::value<size_t>()->default_value(10),
"Stop if the first validation metric does not improve for arg consecutive "
"validation steps")
("valid-log", po::value<std::string>(),
"Log validation scores to file given by arg")
;
po::options_description model("Model options");
model.add_options()
("dim-vocabs", po::value<std::vector<int>>()
->multitoken()
->default_value(std::vector<int>({50000, 50000}), "50000 50000"),
"Maximum items in vocabulary ordered by rank")
("dim-emb", po::value<int>()->default_value(512), "Size of embedding vector")
("dim-rnn", po::value<int>()->default_value(1024), "Size of rnn hidden state")
("no-shuffle", po::value<bool>()->zero_tokens()->default_value(false),
"Skip shuffling of training data before each epoch")
("layers-enc", po::value<int>()->default_value(8), "Number of encoder layers")
("layers-dec", po::value<int>()->default_value(8), "Number of decoder layers")
("skip", po::value<bool>()->zero_tokens()->default_value(false),
"Use skip connections")
("normalize", po::value<bool>()->zero_tokens()->default_value(false),
"Enable layer normalization")
("dropout-rnn", po::value<float>()->default_value(0),
"Scaling dropout along rnn layers and time (0 = no dropout)")
;
po::options_description opt("Optimizer options");
opt.add_options()
("mini-batch,b", po::value<int>()->default_value(64),
"Size of mini-batch used during update")
("maxi-batch", po::value<int>()->default_value(100),
"Number of batches to preload for length-based sorting")
("optimizer,o", po::value<std::string>()->default_value("adam"),
"Optimization algorithm (possible values: sgd, adagrad, adam")
("learn-rate,l", po::value<double>()->default_value(0.0001),
"Learning rate")
("clip-norm", po::value<double>()->default_value(1.f),
"Clip gradient norm to arg (0 to disable)")
("device,d", po::value<std::vector<int>>()
->multitoken()
->default_value(std::vector<int>({0}), "0"),
"GPUs to use for training. Asynchronous SGD is used with multiple devices.")
;
po::options_description configuration("Configuration meta options");
@ -191,7 +231,9 @@ void Config::addOptions(int argc, char** argv) {
po::options_description cmdline_options("Allowed options");
cmdline_options.add(general);
cmdline_options.add(hyper);
cmdline_options.add(valid);
cmdline_options.add(model);
cmdline_options.add(opt);
cmdline_options.add(configuration);
boost::program_options::variables_map vm_;
@ -223,14 +265,25 @@ void Config::addOptions(int argc, char** argv) {
SET_OPTION("device", std::vector<int>);
SET_OPTION_NONDEFAULT("init", std::string);
SET_OPTION("overwrite", bool);
SET_OPTION_NONDEFAULT("log", std::string);
// SET_OPTION_NONDEFAULT("trainsets", std::vector<std::string>);
if (!vm_["trainsets"].empty()) {
config_["trainsets"] = vm_["trainsets"].as<std::vector<std::string>>();
if (!vm_["train-sets"].empty()) {
config_["train-sets"] = vm_["train-sets"].as<std::vector<std::string>>();
}
if (!vm_["valid-sets"].empty()) {
config_["valid-sets"] = vm_["valid-sets"].as<std::vector<std::string>>();
}
if (!vm_["vocabs"].empty()) {
config_["vocabs"] = vm_["vocabs"].as<std::vector<std::string>>();
}
SET_OPTION_NONDEFAULT("valid-sets", std::vector<std::string>);
SET_OPTION("valid-freq", size_t);
SET_OPTION("valid-metrics", std::vector<std::string>);
SET_OPTION("early-stopping", size_t);
SET_OPTION_NONDEFAULT("valid-log", std::string);
// SET_OPTION_NONDEFAULT("vocabs", std::vector<std::string>);
SET_OPTION("after-epochs", size_t);
SET_OPTION("after-batches", size_t);
@ -242,14 +295,22 @@ void Config::addOptions(int argc, char** argv) {
SET_OPTION("max-length", size_t);
SET_OPTION("mini-batch", int);
SET_OPTION("maxi-batch", int);
SET_OPTION("lrate", double);
SET_OPTION("optimizer", std::string);
SET_OPTION("learn-rate", double);
SET_OPTION("clip-norm", double);
SET_OPTION("dim-vocabs", std::vector<int>);
SET_OPTION("layers-enc", int);
SET_OPTION("layers-dec", int);
SET_OPTION("dim-emb", int);
SET_OPTION("dim-rnn", int);
SET_OPTION("no-shuffle", bool);
validate();
SET_OPTION("normalize", bool);
SET_OPTION("dropout-rnn", float);
SET_OPTION("skip", bool);
if(doValidate)
validate();
if (get<bool>("relative-paths") && !vm_["dump-config"].as<bool>())
ProcessPaths(config_, boost::filesystem::path{configPath}.parent_path(), false);
@ -263,9 +324,17 @@ void Config::addOptions(int argc, char** argv) {
}
void Config::logOptions() {
std::stringstream ss;
void Config::log() {
createLoggers(*this);
YAML::Emitter out;
OutputRec(config_, out);
std::cerr << "Options: \n" << out.c_str() << std::endl;
std::string conf = out.c_str();
std::vector<std::string> results;
boost::algorithm::split(results, conf, boost::is_any_of("\n"));
for(auto &r : results)
LOG(config) << r;
}
}
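
Putting the options above together, a configuration file passed on the command line could look like the following sketch (the option names are the ones registered above; all paths and values are placeholders, not taken from the repository):

# hypothetical training config
train-sets: [corpus.en, corpus.de]
vocabs: [vocab.en.json, vocab.de.json]
dim-emb: 512
dim-rnn: 1024
layers-enc: 8
layers-dec: 8
skip: true
normalize: true
dropout-rnn: 0.2
optimizer: adam
learn-rate: 0.0001
clip-norm: 1
device: [0, 1]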

View File

@ -1,12 +1,17 @@
#pragma once
#include <yaml-cpp/yaml.h>
#include <boost/program_options.hpp>
#include "3rd_party/yaml-cpp/yaml.h"
#include "common/logging.h"
namespace marian {
class Config {
public:
Config(int argc, char** argv) {
addOptions(argc, argv);
Config(int argc, char** argv, bool validate = true) {
addOptions(argc, argv, validate);
log();
}
bool has(const std::string& key) const;
@ -19,12 +24,14 @@ class Config {
}
const YAML::Node& get() const;
YAML::Node& get();
YAML::Node operator[](const std::string& key) const {
return get(key);
}
void addOptions(int argc, char** argv);
void logOptions();
void addOptions(int argc, char** argv, bool validate);
void log();
void validate() const;
template <class OStream>
@ -37,3 +44,5 @@ class Config {
std::string inputPath;
YAML::Node config_;
};
}

View File

@ -1,242 +1,267 @@
#pragma once
#include <thread>
#include <future>
#include "common/definitions.h"
#include "3rd_party/threadpool.h"
#include "optimizers/optimizers.h"
#include "training/training.h"
#include "training/validator.h"
namespace marian {
class Reporter {
public:
Ptr<Config> options_;
float costSum{0};
size_t epochs{1};
size_t samples{0};
size_t wordsDisp{0};
size_t batches{0};
boost::timer::cpu_timer timer;
public:
Reporter(Ptr<Config> options) : options_(options) {}
void update(float cost, Ptr<data::CorpusBatch> batch) {
static std::mutex sMutex;
std::lock_guard<std::mutex> guard(sMutex);
costSum += cost;
samples += batch->size();
wordsDisp += batch->words();
batches++;
//if(options.get<size_t>("after-batches")
// && batches >= options.get<size_t>("after-batches"))
// break;
if(batches % options_->get<size_t>("disp-freq") == 0) {
std::stringstream ss;
ss << "Ep. " << epochs
<< " : Up. " << batches
<< " : Sen. " << samples
<< " : Cost " << std::fixed << std::setprecision(2)
<< costSum / options_->get<size_t>("disp-freq")
<< " : Time " << timer.format(2, "%ws");
float seconds = std::stof(timer.format(5, "%w"));
float wps = wordsDisp / (float)seconds;
ss << " : " << std::fixed << std::setprecision(2)
<< wps << " words/s";
LOG(info) << ss.str();
timer.start();
costSum = 0;
wordsDisp = 0;
}
}
};
class GraphGroup {
protected:
Ptr<Config> options_;
Ptr<Reporter> reporter_;
Ptr<OptimizerBase> opt_;
std::vector<Ptr<ExpressionGraph>> graphs_;
public:
GraphGroup(Ptr<Config> options)
: options_(options) {
Ptr<ClipperBase> clipper = nullptr;
float clipNorm = options_->get<double>("clip-norm");
float lrate = options_->get<double>("lrate");
if(clipNorm > 0)
clipper = Clipper<Norm>(clipNorm);
opt_ = Optimizer<Adam>(lrate,
keywords::clip=clipper);
}
: options_(options), opt_(Optimizer(options)) { }
virtual void update(Ptr<data::CorpusBatch>) = 0;
virtual void setReporter(Ptr<Reporter> reporter) {
reporter_ = reporter;
reporter_ = reporter;
}
virtual void load() = 0;
virtual void save() = 0;
};
template <class Builder>
class AsynchronousGraphGroup : public GraphGroup {
class AsyncGraphGroup : public GraphGroup {
private:
Ptr<Builder> builder_;
std::vector<Ptr<Builder>> builders_;
std::vector<size_t> devices_;
ThreadPool pool_;
std::vector<Ptr<ExpressionGraph>> graphs_;
std::mutex sync_;
Tensor params_;
Ptr<TensorAllocator> paramsAlloc_;
Tensor grads_;
Ptr<TensorAllocator> gradsAlloc_;
std::vector<std::mutex> shardSync_;
std::vector<Tensor> params_;
std::vector<Ptr<TensorAllocator> > paramsAlloc_;
std::vector<Tensor> grads_;
std::vector<Ptr<TensorAllocator>> gradsAlloc_;
std::vector<Ptr<OptimizerBase>> shardOpt_;
int shardSize_;
ThreadPool pool_;
void fetchParams(Tensor oldParams) {
if(graphs_.size() < 2)
return;
// @TODO read guard on parameters
std::lock_guard<std::mutex> guard(sync_);
oldParams->copyFrom(params_);
int pos = 0;
std::vector<std::thread> threads;
for (int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back( std::thread( [=](int idx, int pos) {
// individual mutex per shard
std::lock_guard<std::mutex> guard( shardSync_[idx] );
oldParams->subtensor(pos , params_[idx]->size())->copyFrom(params_[idx]);
}, idx, pos) );
pos += shardSize_;
}
for (auto &&t : threads) {
t.join();
}
}
void pushGradients(Tensor newGrads) {
if(graphs_.size() < 2) {
opt_->update(graphs_[0]);
}
else {
std::lock_guard<std::mutex> guard(sync_);
grads_->copyFrom(newGrads);
opt_->update(params_, grads_);
// add instead of copy?
std::vector<std::thread> threads;
int pos = 0;
for (int idx = 0; idx < devices_.size(); idx++) {
threads.emplace_back( std::thread([=](int idx, int pos) {
// individual mutex per shard
std::lock_guard<std::mutex> guard( shardSync_[idx] );
grads_[idx]->copyFrom( newGrads->subtensor(pos , grads_[idx]->size() ) );
shardOpt_[idx]->update(params_[idx], grads_[idx]);
cudaStreamSynchronize(0);
} , idx, pos) );
pos += shardSize_;
}
for(auto&& t : threads)
t.join();
}
}
void execute(Ptr<data::CorpusBatch> batch) {
static bool first = true;
if(first && graphs_.size() > 1) {
// initialize the parameters
for(auto graph : graphs_) {
builder_->build(graph, batch);
graph->forward();
// initialize the parameters
for(size_t i = 0; i < graphs_.size(); ++i) {
builders_[i]->build(graphs_[i], batch);
graphs_[i]->forward();
}
if(!params_) {
paramsAlloc_ = New<TensorAllocator>(graphs_[0]->getDevice());
if(params_.size() == 0) {
int totalSize = graphs_[0]->params().vals()->size();
paramsAlloc_->reserveExact(totalSize);
paramsAlloc_->allocate(params_, {1, totalSize});
shardSize_ = (int)ceil(totalSize / (float)devices_.size());
int pos = 0;
// parameter sharding
for (auto device : devices_){
int __size__ = std::min(shardSize_, totalSize);
totalSize -= __size__;
Tensor param_;
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(device);
allocator_->reserveExact(__size__);
allocator_->allocate(param_, {1, __size__});
paramsAlloc_.push_back(allocator_);
param_->copyFrom( graphs_[0]->params().vals()->subtensor( pos , __size__ ) );
params_.push_back(param_);
pos += __size__;
}
}
if(!grads_) {
gradsAlloc_ = New<TensorAllocator>(graphs_[0]->getDevice());
if(grads_.size() == 0) {
int totalSize = graphs_[0]->params().vals()->size();
gradsAlloc_->reserveExact(totalSize);
gradsAlloc_->allocate(grads_, {1, totalSize});
for (auto device : devices_){
int __size__ = std::min(shardSize_, totalSize);
totalSize -= __size__;
Tensor grad_;
Ptr<TensorAllocator> allocator_ = New<TensorAllocator>(device);
allocator_->reserveExact(__size__);
allocator_->allocate(grad_, {1, __size__});
gradsAlloc_.push_back(allocator_);
grads_.push_back(grad_);
}
}
params_->copyFrom(graphs_[0]->params().vals());
first = false;
}
auto task = [this](Ptr<data::CorpusBatch> batch) {
static size_t i = 0;
thread_local Ptr<ExpressionGraph> graph;
thread_local Ptr<Builder> builder;
thread_local size_t t = 0;
if(!graph) {
std::lock_guard<std::mutex> lock(sync_);
graph = graphs_[i++];
graph = graphs_[i];
builder = builders_[i++];
}
builder_->build(graph, batch);
builder->build(graph, batch);
fetchParams(graph->params().vals());
graph->forward();
float cost = graph->topNode()->scalar();
graph->backward();
cudaStreamSynchronize(0);
pushGradients(graph->params().grads());
if(reporter_) {
std::lock_guard<std::mutex> guard(sync_);
reporter_->update(cost, batch);
if(reporter_->batches % options_->get<size_t>("save-freq") == 0)
this->save();
size_t prevStalled = reporter_->stalled();
reporter_->validate(graph);
if(prevStalled < reporter_->stalled())
for(auto opt : shardOpt_)
opt->updateSchedule();
}
t++;
};
pool_.enqueue(task, batch);
}
public:
AsynchronousGraphGroup(Ptr<Config> options)
: GraphGroup(options),
builder_{New<Builder>(options_)},
devices_{options_->get<std::vector<size_t>>("device")},
pool_{devices_.size(), devices_.size() } {
for(auto device : devices_) {
graphs_.emplace_back(New<ExpressionGraph>());
graphs_.back()->setDevice(device);
graphs_.back()->reserveWorkspaceMB(options_->get<size_t>("workspace"));
}
void load() {
if(options_->has("init")) {
std::string init = options_->get<std::string>("init");
size_t i = 0;
for(auto graph : graphs_)
builders_[i++]->load(graph, init);
}
}
public:
typedef Builder builder_type;
AsyncGraphGroup(Ptr<Config> options)
: GraphGroup(options),
devices_{options_->get<std::vector<size_t>>("device")},
pool_{devices_.size(), devices_.size()},
shardSync_{devices_.size()} {
for(auto device : devices_) {
auto graph = New<ExpressionGraph>();
graph->setDevice(device);
graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
graphs_.push_back(graph);
shardOpt_.push_back(Optimizer(options_));
builders_.push_back(New<Builder>(options_));
}
load();
}
void update(Ptr<data::CorpusBatch> batch) {
execute(batch);
}
void save() {
std::lock_guard<std::mutex> guard(sync_);
if(options_->get<bool>("overwrite")) {
std::string name = options_->get<std::string>("model") + ".npz";
builder_->save(graphs_[0], name);
builders_[0]->save(graphs_[0], name);
}
else {
std::string name = options_->get<std::string>("model")
+ "." + std::to_string(reporter_->batches) + ".npz";
builder_->save(graphs_[0], name);
builders_[0]->save(graphs_[0], name);
}
}
};
template <class Builder>
class SynchronousGraphGroup : public GraphGroup {
class SyncGraphGroup : public GraphGroup {
private:
Ptr<Builder> builder_;
std::vector<Ptr<data::CorpusBatch>> batches_;
bool first_{true};
void accumulateGradients(Ptr<ExpressionGraph> master,
std::vector<Ptr<ExpressionGraph>> graphs) {
if(graphs_.size() < 2) {
return;
}
Tensor grads = master->params().grads();
Tensor tempGrads;
master->tensor(tempGrads, grads->shape());
for(auto graph : graphs) {
if(graph != master) {
Tensor remoteGrads = graph->params().grads();
@@ -244,24 +269,24 @@ class SynchronousGraphGroup : public GraphGroup {
Element(_1 += _2, grads, tempGrads);
}
}
float denom = graphs_.size();
Element(_1 /= denom, grads);
}
void distributeParameters(Ptr<ExpressionGraph> master,
std::vector<Ptr<ExpressionGraph>> graphs) {
if(graphs_.size() < 2)
return;
Tensor params = master->params().vals();
for(auto graph : graphs) {
if(graph != master) {
graph->params().vals()->copyFrom(params);
}
}
}
void execute() {
if(first_) {
for(auto graph : graphs_) {
@@ -271,66 +296,77 @@ class SynchronousGraphGroup : public GraphGroup {
distributeParameters(graphs_[0], graphs_);
first_ = false;
}
auto task = [this](int i,
Ptr<data::CorpusBatch> batch) {
thread_local int j = -1;
if(j == -1)
j = i;
auto localGraph = this->graphs_[j];
builder_->build(localGraph, batch);
localGraph->forward();
float cost = localGraph->topNode()->scalar();
localGraph->backward();
if(reporter_) {
reporter_->update(cost, batch);
if(reporter_->batches % options_->get<size_t>("save-freq") == 0)
this->save();
}
};
{
size_t workers = graphs_.size();
ThreadPool pool(workers, workers);
for(int i = 0; i < batches_.size(); ++i)
pool.enqueue(task, i % (int)workers, batches_[i]);
}
}
accumulateGradients(graphs_[0], graphs_);
opt_->update(graphs_[0]);
distributeParameters(graphs_[0], graphs_);
batches_.clear();
}
void load() {
if(options_->has("init")) {
std::string init = options_->get<std::string>("init");
for(auto graph : graphs_)
builder_->load(graph, init);
}
}
public:
SynchronousGraphGroup(Ptr<Config> options)
typedef Builder builder_type;
SyncGraphGroup(Ptr<Config> options)
: GraphGroup(options),
builder_{New<Builder>(options_)} {
auto devices = options_->get<std::vector<size_t>>("device");
size_t workers = devices.size();
for(auto device : devices) {
graphs_.emplace_back(New<ExpressionGraph>());
graphs_.back()->setDevice(device);
graphs_.back()->reserveWorkspaceMB(options_->get<size_t>("workspace"));
}
load();
}
~SynchronousGraphGroup() {
~SyncGraphGroup() {
execute();
}
void update(Ptr<data::CorpusBatch> batch) {
batches_.push_back(batch);
if(batches_.size() == graphs_.size())
execute();
}
void save() {
if(options_->get<bool>("overwrite")) {
std::string name = options_->get<std::string>("model") + ".npz";
@@ -342,6 +378,7 @@ class SynchronousGraphGroup : public GraphGroup {
builder_->save(graphs_[0], name);
}
}
};
}
}
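
The AsyncGraphGroup hunks above replace the single global parameter tensor and its one big mutex with per-device shards, each guarded by its own mutex and updated by its own optimizer. A simplified, CPU-only sketch of that locking scheme (shard count, learning rate, and names are illustrative, not the commit's code):

    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
      const int numShards = 4, shardSize = 8;
      std::vector<std::vector<float>> params(numShards,
                                             std::vector<float>(shardSize, 0.f));
      std::vector<std::mutex> shardSync(numShards);

      // A worker pushes its gradient shard by shard; only the shard being
      // written is locked, so concurrent workers rarely block each other.
      auto pushGradients = [&](const std::vector<float>& grads) {
        std::vector<std::thread> threads;
        for(int idx = 0; idx < numShards; ++idx) {
          threads.emplace_back([&, idx]() {
            std::lock_guard<std::mutex> guard(shardSync[idx]);
            for(int j = 0; j < shardSize; ++j)  // plain per-shard SGD step
              params[idx][j] -= 0.1f * grads[idx * shardSize + j];
          });
        }
        for(auto& t : threads)
          t.join();
      };

      std::vector<float> grads(numShards * shardSize, 1.f);
      pushGradients(grads);
    }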

src/training/training.h Normal file

@@ -0,0 +1,152 @@
#pragma once
#include "data/batch_generator.h"
#include "data/corpus.h"
#include "training/config.h"
#include "training/validator.h"
namespace marian {
class Reporter {
public:
Ptr<Config> options_;
std::vector<Ptr<Validator>> validators_;
float costSum{0};
size_t epochs{1};
size_t samples{0};
size_t wordsDisp{0};
size_t batches{0};
boost::timer::cpu_timer timer;
public:
Reporter(Ptr<Config> options) : options_(options) {}
bool keepGoing() {
// stop if it reached the maximum number of epochs
if(options_->get<size_t>("after-epochs") > 0
&& epochs > options_->get<size_t>("after-epochs"))
return false;
// stop if it reached the maximum number of batch updates
if(options_->get<size_t>("after-batches") > 0
&& batches >= options_->get<size_t>("after-batches"))
return false;
// stop if the first validator did not improve for a given number of checks
if(options_->get<size_t>("early-stopping") > 0
&& !validators_.empty()
&& validators_[0]->stalled() >= options_->get<size_t>("early-stopping"))
return false;
return true;
}
void increaseEpoch() {
LOG(info) << "Seen " << samples << " samples";
epochs++;
samples = 0;
LOG(info) << "Starting epoch " << epochs;
}
void finished() {
LOG(info) << "Training finshed";
}
void addValidator(Ptr<Validator> validator) {
validators_.push_back(validator);
}
void validate(Ptr<ExpressionGraph> graph) {
if(batches % options_->get<size_t>("valid-freq") == 0) {
for(auto validator : validators_) {
if(validator) {
size_t stalledPrev = validator->stalled();
float value = validator->validate(graph);
std::stringstream ss;
ss << batches << " : ";
ss << validator->type() << " : " << value;
if(validator->stalled() > 0)
ss << " : stalled " << validator->stalled() << " times";
else
ss << " : new best";
LOG(valid) << ss.str();
}
}
}
}
size_t stalled() {
for(auto validator : validators_)
if(validator)
return validator->stalled();
return 0;
}
void update(float cost, Ptr<data::CorpusBatch> batch) {
costSum += cost;
samples += batch->size();
wordsDisp += batch->words();
batches++;
if(batches % options_->get<size_t>("disp-freq") == 0) {
std::stringstream ss;
ss << "Ep. " << epochs
<< " : Up. " << batches
<< " : Sen. " << samples
<< " : Cost " << std::fixed << std::setprecision(2)
<< costSum / options_->get<size_t>("disp-freq")
<< " : Time " << timer.format(2, "%ws");
float seconds = std::stof(timer.format(5, "%w"));
float wps = wordsDisp / (float)seconds;
ss << " : " << std::fixed << std::setprecision(2)
<< wps << " words/s";
LOG(info) << ss.str();
timer.start();
costSum = 0;
wordsDisp = 0;
}
}
};
template <class Model>
void Train(Ptr<Config> options) {
using namespace data;
using namespace keywords;
auto trainCorpus = New<Corpus>(options);
auto batchGenerator = New<BatchGenerator<Corpus>>(trainCorpus,
options);
auto reporter = New<Reporter>(options);
if(options->has("valid-sets") && options->get<size_t>("valid-freq") > 0) {
for(auto validator : Validators<typename Model::builder_type>(trainCorpus->getVocabs(),
options))
reporter->addValidator(validator);
}
auto model = New<Model>(options);
model->setReporter(reporter);
while(reporter->keepGoing()) {
batchGenerator->prepare(!options->get<bool>("no-shuffle"));
while(*batchGenerator && reporter->keepGoing()) {
auto batch = batchGenerator->next();
model->update(batch);
}
if(reporter->keepGoing())
reporter->increaseEpoch();
}
reporter->finished();
model->save();
}
}
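
`Train` keeps iterating over epochs and batches while `Reporter::keepGoing()` holds, so training ends on whichever limit fires first. A self-contained sketch of the three stopping criteria, with invented limits:

    #include <cstddef>
    #include <iostream>

    struct StopCriteria {
      size_t afterEpochs;    // 0 disables the criterion
      size_t afterBatches;   // 0 disables the criterion
      size_t earlyStopping;  // max. tolerated stalled validations
    };

    bool keepGoing(const StopCriteria& c,
                   size_t epochs, size_t batches, size_t stalled) {
      if(c.afterEpochs > 0 && epochs > c.afterEpochs)
        return false;
      if(c.afterBatches > 0 && batches >= c.afterBatches)
        return false;
      if(c.earlyStopping > 0 && stalled >= c.earlyStopping)
        return false;
      return true;
    }

    int main() {
      StopCriteria c{0, 10000, 5};
      std::cout << keepGoing(c, 3, 9999, 2) << "\n";   // 1: keep training
      std::cout << keepGoing(c, 3, 10000, 2) << "\n";  // 0: batch limit reached
    }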

src/training/validator.h Normal file

@@ -0,0 +1,146 @@
#pragma once
#include <limits>
#include "training/config.h"
#include "graph/expression_graph.h"
#include "data/corpus.h"
#include "data/batch_generator.h"
namespace marian {
class Validator {
protected:
Ptr<Config> options_;
std::vector<Ptr<Vocab>> vocabs_;
float lastBest_;
size_t stalled_{0};
public:
Validator(std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: options_(options),
vocabs_(vocabs),
lastBest_{lowerIsBetter() ?
std::numeric_limits<float>::max() :
std::numeric_limits<float>::lowest() } {
}
virtual std::string type() = 0;
virtual bool lowerIsBetter() {
return true;
}
size_t stalled() {
return stalled_;
}
float validate(Ptr<ExpressionGraph> graph) {
using namespace data;
auto validPaths = options_->get<std::vector<std::string>>("valid-sets");
auto corpus = New<Corpus>(validPaths, vocabs_, options_);
Ptr<BatchGenerator<Corpus>> batchGenerator
= New<BatchGenerator<Corpus>>(corpus, options_);
batchGenerator->prepare(false);
float val = validate(graph, batchGenerator);
if((lowerIsBetter() && lastBest_ > val) ||
(!lowerIsBetter() && lastBest_ < val)) {
stalled_ = 0;
lastBest_ = val;
}
else {
stalled_++;
}
return val;
}
virtual float validate(Ptr<ExpressionGraph>,
Ptr<data::BatchGenerator<data::Corpus>>) = 0;
};
template <class Builder>
class CrossEntropyValidator : public Validator {
private:
Ptr<Builder> builder_;
public:
CrossEntropyValidator(std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: Validator(vocabs, options),
builder_(New<Builder>(options)) {}
float validate(Ptr<ExpressionGraph> graph,
Ptr<data::BatchGenerator<data::Corpus>> batchGenerator) {
float cost = 0;
size_t samples = 0;
while(*batchGenerator) {
auto batch = batchGenerator->next();
builder_->build(graph, batch);
graph->forward();
cost += graph->topNode()->scalar() * batch->size();
samples += batch->size();
}
return cost / samples;
}
std::string type() { return "cross-entropy"; }
};
template <class Builder>
class PerplexityValidator : public Validator {
private:
Ptr<Builder> builder_;
public:
PerplexityValidator(std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options)
: Validator(vocabs, options),
builder_(New<Builder>(options)) {}
float validate(Ptr<ExpressionGraph> graph,
Ptr<data::BatchGenerator<data::Corpus>> batchGenerator) {
float cost = 0;
size_t words = 0;
while(*batchGenerator) {
auto batch = batchGenerator->next();
builder_->build(graph, batch);
graph->forward();
cost += graph->topNode()->scalar() * batch->size();
words += batch->words();
}
return expf(cost / words);
}
std::string type() { return "perplexity"; }
};
template <class Builder>
std::vector<Ptr<Validator>> Validators(std::vector<Ptr<Vocab>> vocabs,
Ptr<Config> options) {
std::vector<Ptr<Validator>> validators;
auto validMetrics = options->get<std::vector<std::string>>("valid-metrics");
for(auto metric : validMetrics) {
if(metric == "cross-entropy") {
auto validator = New<CrossEntropyValidator<Builder>>(vocabs, options);
validators.push_back(validator);
}
if(metric == "perplexity") {
auto validator = New<PerplexityValidator<Builder>>(vocabs, options);
validators.push_back(validator);
}
}
return validators;
}
}
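
Note how the two validators differ only in normalization: `CrossEntropyValidator` averages the summed sentence costs over sentences, while `PerplexityValidator` averages over target words and exponentiates. A tiny worked sketch with invented numbers:

    #include <cmath>
    #include <iostream>

    int main() {
      float sumCost = 2309.7f;  // summed sentence costs (negative log-likelihoods)
      size_t words = 1000;      // target words in the validation set
      float ppl = std::exp(sumCost / words);
      std::cout << "perplexity: " << ppl << std::endl;  // exp(2.3097) ~ 10.07
    }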


@@ -0,0 +1,360 @@
#include <iostream>
#include "translator/nth_element.h"
namespace marian {
void HandleError(cudaError_t err, const char *file, int line ) {
if (err != cudaSuccess) {
std::cerr << "ERROR: " << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl;
exit( EXIT_FAILURE );
}
}
#define UNROLL_MAXARG_LOOP( n, max ) \
if (tid < (n) && tid + (n) < ( max ) ) { \
if (sdata[tid + ( n ) ] > sdata[tid]) { \
sdata[tid] = sdata[tid + ( n ) ]; \
indices[tid] = indices[tid + ( n ) ]; \
} \
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
__global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatches, int* batchFirstElementIdxs) {
extern __shared__ float sdata[];
__shared__ int indices[512];
int tid = threadIdx.x;
for (int batchIdx = 0; batchIdx < numBatches; ++batchIdx) {
int begin = batchFirstElementIdxs[batchIdx];
int end = batchFirstElementIdxs[batchIdx + 1];
int i = begin + blockIdx.x * (blockDim.x * 2) + tid;
sdata[tid] = -3.40282e+38f;
if (i < end) {
sdata[tid] = d_in[i];
indices[tid] = i;
}
if (i + blockDim.x < end) {
float a = d_in[i];
float b = d_in[i + blockDim.x];
if (a > b) {
sdata[tid] = a;
indices[tid] = i;
} else {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
}
while (i + 2 * gridDim.x * blockDim.x < end) {
i += 2 * gridDim.x * blockDim.x;
float a = d_in[i];
if (a > sdata[tid]) {
sdata[tid] = a;
indices[tid] = i;
}
if (i + blockDim.x < end) {
float b = d_in[i + blockDim.x];
if (b > sdata[tid]) {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
}
}
__syncthreads();
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if (tid < s && tid + s < end) {
if (sdata[tid + s] > sdata[tid]) {
sdata[tid] = sdata[tid + s];
indices[tid] = indices[tid + s];
}
}
__syncthreads();
}
UNROLL_MAXARG_LOOP(32, end);
UNROLL_MAXARG_LOOP(16, end);
UNROLL_MAXARG_LOOP(8, end);
UNROLL_MAXARG_LOOP(4, end);
UNROLL_MAXARG_LOOP(2, end);
UNROLL_MAXARG_LOOP(1, end);
if (tid == 0) {
d_out[blockIdx.x + batchIdx * gridDim.x] = sdata[0];
d_ind[blockIdx.x + batchIdx * gridDim.x] = indices[0];
}
__syncthreads();
}
}
__global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, int *batchFirstElements, float* outCosts, int* outIdxs, int *cummulatedBeamSizes, int NUM_BLOCKS) {
extern __shared__ float sdata[];
__shared__ int indices[512];
__shared__ float bestBinCost;
__shared__ int bestBinCostIdx;
const int tid = threadIdx.x;
const int batchIdx = blockIdx.x;
const int N = batchFirstElements[batchIdx + 1] - batchFirstElements[batchIdx];
int num_bins = int(N / (2 * 512)) + int(N % (2 * 512) != 0);
if (num_bins > 500) {
num_bins = 500;
}
for (int pos = cummulatedBeamSizes[batchIdx]; pos < cummulatedBeamSizes[batchIdx + 1]; ++pos) {
int i = tid;
sdata[tid] = -3.40282e+38f;
if (i < num_bins) {
sdata[tid] = binCosts[batchIdx * NUM_BLOCKS + i];
indices[tid] = i;
}
if (i + blockDim.x < num_bins) {
float a = binCosts[batchIdx * NUM_BLOCKS + i];
float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x];
if (a > b) {
sdata[tid] = a;
indices[tid] = i;
} else {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
}
while (i + 2 * blockDim.x < num_bins) {
i += 2 * blockDim.x;
float a = binCosts[batchIdx * NUM_BLOCKS + i];
if (a > sdata[tid]) {
sdata[tid] = a;
indices[tid] = i;
}
if (i + blockDim.x < num_bins) {
float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x];
if (b > sdata[tid]) {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
}
}
__syncthreads();
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if (tid < s && tid + s < num_bins) {
if (sdata[tid + s] > sdata[tid]) {
sdata[tid] = sdata[tid + s];
indices[tid] = indices[tid + s];
}
}
__syncthreads();
}
UNROLL_MAXARG_LOOP(32, num_bins);
UNROLL_MAXARG_LOOP(16, num_bins);
UNROLL_MAXARG_LOOP(8, num_bins);
UNROLL_MAXARG_LOOP(4, num_bins);
UNROLL_MAXARG_LOOP(2, num_bins);
UNROLL_MAXARG_LOOP(1, num_bins);
if (tid == 0) {
bestBinCost = sdata[0];
bestBinCostIdx = batchIdx * NUM_BLOCKS + indices[0];
probs[binIdxs[bestBinCostIdx]] = -3.40282e+38f;
outIdxs[pos] = binIdxs[bestBinCostIdx];
outCosts[pos] = bestBinCost;
}
__syncthreads();
i = batchFirstElements[batchIdx] + (bestBinCostIdx - batchIdx * NUM_BLOCKS) * (blockDim.x * 2) + tid;
const int dist = num_bins * 2 * blockDim.x;
sdata[tid] = -3.40282e+38f;
if (i < batchFirstElements[batchIdx + 1]) {
sdata[tid] = probs[i];
indices[tid] = i;
}
if (i + blockDim.x < batchFirstElements[batchIdx + 1]) {
float a = probs[i];
float b = probs[i+blockDim.x];
if (a > b) {
sdata[tid] = a;
indices[tid] = i;
} else {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
}
while (i + dist < batchFirstElements[batchIdx + 1]) {
i += dist;
float a = probs[i];
if (a > sdata[tid]) {
sdata[tid] = a;
indices[tid] = i;
}
if (i + blockDim.x < batchFirstElements[batchIdx + 1]) {
float b = probs[i + blockDim.x];
if (b > sdata[tid]) {
sdata[tid] = b;
indices[tid] = i + blockDim.x;
}
}
}
__syncthreads();
for (int s = (blockDim.x >> 1); s > 32; s >>= 1) {
if (tid < s && tid + s < batchFirstElements[batchIdx + 1]) {
if (sdata[tid + s] > sdata[tid]) {
sdata[tid] = sdata[tid + s];
indices[tid] = indices[tid + s];
}
}
__syncthreads();
}
UNROLL_MAXARG_LOOP(32, batchFirstElements[batchIdx + 1]);
UNROLL_MAXARG_LOOP(16, batchFirstElements[batchIdx + 1]);
UNROLL_MAXARG_LOOP(8, batchFirstElements[batchIdx + 1]);
UNROLL_MAXARG_LOOP(4, batchFirstElements[batchIdx + 1]);
UNROLL_MAXARG_LOOP(2, batchFirstElements[batchIdx + 1]);
UNROLL_MAXARG_LOOP(1, batchFirstElements[batchIdx + 1]);
if (tid == 0) {
binCosts[bestBinCostIdx] = sdata[0];
binIdxs[bestBinCostIdx] = indices[0];
}
__syncthreads();
}
}
__global__ void gGetValueByKey(float* d_in, float* d_out, int* indices, int n)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
if (tid < n) {
int index = indices[tid];
d_out[tid] = d_in[index];
}
}
NthElement::NthElement(size_t maxBeamSize, size_t maxBatchSize, cudaStream_t stream)
: stream_(stream) ,
NUM_BLOCKS(std::min(500, int(maxBeamSize * 85000 / (2 * BLOCK_SIZE)) + int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0)))
{
//std::cerr << "NthElement::NthElement" << std::endl;
HANDLE_ERROR( cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)) );
HANDLE_ERROR( cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)) );
HANDLE_ERROR( cudaHostAlloc((void**) &h_res, maxBeamSize * maxBatchSize * sizeof(float),
cudaHostAllocDefault) );
HANDLE_ERROR( cudaHostAlloc((void**) &h_res_idx, maxBeamSize * maxBatchSize * sizeof(int),
cudaHostAllocDefault) );
HANDLE_ERROR( cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)) );
HANDLE_ERROR( cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)) );
HANDLE_ERROR( cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)) );
}
NthElement::~NthElement()
{
HANDLE_ERROR(cudaFree(d_ind));
HANDLE_ERROR(cudaFree(d_out));
HANDLE_ERROR(cudaFree(d_res_idx));
HANDLE_ERROR(cudaFree(d_res));
HANDLE_ERROR(cudaFreeHost(h_res));
HANDLE_ERROR(cudaFreeHost(h_res_idx));
HANDLE_ERROR(cudaFree(d_breakdown));
HANDLE_ERROR(cudaFree(d_batchPosition));
HANDLE_ERROR(cudaFree(d_cumBeamSizes));
}
void NthElement::getNBestList(float* probs, const std::vector<int>& batchFirstElementIdxs,
const std::vector<int>& cummulatedBeamSizes)
{
HANDLE_ERROR( cudaMemcpyAsync(d_batchPosition, batchFirstElementIdxs.data(), batchFirstElementIdxs.size() * sizeof(int),
cudaMemcpyHostToDevice, stream_) );
HANDLE_ERROR( cudaMemcpyAsync(d_cumBeamSizes, cummulatedBeamSizes.data(), cummulatedBeamSizes.size() * sizeof(int),
cudaMemcpyHostToDevice, stream_) );
const int numBatches = batchFirstElementIdxs.size() - 1;
gMaxElement<<<NUM_BLOCKS, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), stream_>>>
(d_out, d_ind, probs, numBatches, d_batchPosition);
gMaxElementUpdate<<<numBatches, BLOCK_SIZE, BLOCK_SIZE * sizeof(float), stream_>>>
(d_out, d_ind, probs, d_batchPosition, d_res, d_res_idx, d_cumBeamSizes, NUM_BLOCKS);
}
void NthElement::getNBestList(const std::vector<size_t>& beamSizes, Tensor Probs,
std::vector<float>& outCosts, std::vector<unsigned>& outKeys,
const bool isFirst) {
std::vector<int> cummulatedBeamSizes(beamSizes.size() + 1, 0);
std::vector<int> batchFirstElementIdxs(beamSizes.size() + 1, 0);
const size_t vocabSize = Probs->shape()[1];
for (size_t i = 0; i < beamSizes.size(); ++i) {
cummulatedBeamSizes[i + 1] = cummulatedBeamSizes[i] + beamSizes[i];
batchFirstElementIdxs[i + 1] += ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize;
}
getNBestList(Probs->data(), batchFirstElementIdxs, cummulatedBeamSizes);
GetPairs(cummulatedBeamSizes.back(), outKeys, outCosts);
}
void NthElement::GetPairs(size_t number,
std::vector<unsigned>& outKeys,
std::vector<float>& outValues) {
HANDLE_ERROR( cudaMemcpyAsync(h_res, d_res, number * sizeof(float),
cudaMemcpyDeviceToHost, stream_) );
HANDLE_ERROR( cudaMemcpyAsync(h_res_idx, d_res_idx, number * sizeof(int),
cudaMemcpyDeviceToHost, stream_) );
cudaStreamSynchronize(stream_);
for (size_t i = 0; i < number; ++i) {
outKeys.push_back(h_res_idx[i]);
outValues.push_back(h_res[i]);
}
lastN = number;
}
void NthElement::getValueByKey(std::vector<float>& out, float* d_in) {
gGetValueByKey<<<1, lastN, 0, stream_>>>
(d_in, d_breakdown, h_res_idx, lastN);
HANDLE_ERROR( cudaMemcpyAsync(out.data(), d_breakdown, lastN * sizeof(float),
cudaMemcpyDeviceToHost, stream_) );
HANDLE_ERROR( cudaStreamSynchronize(stream_));
}
}
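
Per batch, the pair of kernels above implements an iterative arg-max with masking: find the best score, record its index and value, overwrite it with the lowest float so it cannot win again, and repeat once per requested hypothesis. A CPU reference sketch of that selection (illustrative only; the GPU version amortizes the rescan through the per-block bins in `d_out`/`d_ind`):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    void topK(std::vector<float> probs, size_t k,
              std::vector<unsigned>& outKeys, std::vector<float>& outCosts) {
      for(size_t n = 0; n < k; ++n) {
        auto it = std::max_element(probs.begin(), probs.end());
        outKeys.push_back(it - probs.begin());
        outCosts.push_back(*it);
        *it = -3.40282e+38f;  // same mask value the kernels use
      }
    }

    int main() {
      std::vector<unsigned> keys;
      std::vector<float> costs;
      topK({0.1f, 0.7f, 0.2f}, 2, keys, costs);
      std::cout << keys[0] << " " << keys[1] << std::endl;  // 1 2
    }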


@@ -0,0 +1,51 @@
#pragma once
#include <vector>
#include <algorithm>
#include <cuda.h>
#include "tensors/tensor.h"
namespace marian {
class NthElement {
public:
NthElement() = delete;
NthElement(const NthElement &copy) = delete;
NthElement(size_t maxBeamSize, size_t maxBatchSize, cudaStream_t stream);
virtual ~NthElement();
void getNBestList(float* probs, const std::vector<int>& batchFirstElementIdxs,
const std::vector<int>& cummulatedBeamSizes);
void getNBestList(const std::vector<size_t>& beamSizes, Tensor Probs,
std::vector<float>& outCosts, std::vector<unsigned>& outKeys,
const bool isFirst=false);
void GetPairs(size_t number,
std::vector<unsigned>& outKeys,
std::vector<float>& outValues);
void getValueByKey(std::vector<float>& out, float* d_in);
private:
const int BLOCK_SIZE = 512;
const int NUM_BLOCKS;
cudaStream_t stream_;
int *d_ind;
float *d_out;
int *d_res_idx;
float *d_res;
int *h_res_idx;
float *h_res;
float *d_breakdown;
int *d_batchPosition;
int *d_cumBeamSizes;
size_t lastN;
};
}
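
For orientation, the tensor-level `getNBestList` overload first turns the per-sentence beam sizes into offsets into the flat probability array before calling the kernels. The sketch below mirrors that loop from `nth_element.cu` with invented sizes (variable names shortened for readability):

    #include <iostream>
    #include <vector>

    int main() {
      std::vector<size_t> beamSizes = {3, 5};  // per-sentence beam widths
      const size_t vocabSize = 10;
      const bool isFirst = true;  // first step: one hypothesis per sentence

      std::vector<int> cumBeam(beamSizes.size() + 1, 0);
      std::vector<int> batchFirst(beamSizes.size() + 1, 0);
      for(size_t i = 0; i < beamSizes.size(); ++i) {
        cumBeam[i + 1] = cumBeam[i] + beamSizes[i];
        batchFirst[i + 1] = (isFirst ? int(i + 1) : cumBeam[i + 1]) * vocabSize;
      }
      // batchFirst now delimits each sentence's slice of the flat probs
      // array: [0,10) for sentence 0, [10,20) for sentence 1.
      std::cout << batchFirst[1] << " " << batchFirst[2] << std::endl;  // 10 20
    }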