diff --git a/.gitignore b/.gitignore index f78c8028..53468680 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,3 @@ build # Examples examples/*/*.gz examples/mnist/*ubyte - -.cproject -.project \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 28287da1..0ad27c34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ project(marian CXX) find_package(CUDA "8.0" REQUIRED) SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -Wno-unused-result -Wno-deprecated -fPIC -Wno-deprecated-gpu-targets") -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35; -DCUDNN) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35;) SET(CUDA_PROPAGATE_HOST_FLAGS OFF) include_directories(${amunn_SOURCE_DIR}) diff --git a/README.md b/README.md index 502df966..c4b5fa1a 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@ Marian ====== -[![Join the chat at https://gitter.im/MarianNMT/Lobby](https://badges.gitter.im/MarianNMT/Lobby.svg)](https://gitter.im/MarianNMT/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Join the chat at https://gitter.im/amunmt/marian](https://badges.gitter.im/amunmt/marian.svg)](https://gitter.im/amunmt/marian?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) +[![Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=Marian)](http://vali.inf.ed.ac.uk/jenkins/job/Marian/) Google group for commit messages: https://groups.google.com/forum/#!forum/mariannmt @@ -17,30 +18,12 @@ Installation Requirements: * g++ with c++11 -* CUDA and CuDNN +* CUDA * Boost (>= 1.56) -Exporting some paths for CuDNN may be required (put it, for example, in your `.bashrc` file): - - export PATH=$PATH:$HOME/.local/bin:/usr/local/cuda/bin - export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64 - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64 - export CPATH=$CPATH:/usr/local/cudnn-5/include - Compilation with `cmake > 3.5`: mkdir build cd build cmake .. 
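    # If CUDA or Boost live in non-standard locations, hint them to CMake, e.g.:
    #   cmake .. -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda -DBOOST_ROOT=/usr/local/boost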
make -j - -To compile API documentation using Doxygen, first cd to the build directory, and then: - - make doc - -To test, first compile, then: - - cd examples/mnist - make - cd ../../build - ./mnist_benchmark diff --git a/marian/.cproject b/marian/.cproject new file mode 100644 index 00000000..184c39a4 --- /dev/null +++ b/marian/.cproject @@ -0,0 +1,163 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/marian/.project b/marian/.project new file mode 100644 index 00000000..d1163076 --- /dev/null +++ b/marian/.project @@ -0,0 +1,34 @@ + + + marian + + + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + src + 2 + PARENT-1-PROJECT_LOC/src + + + diff --git a/src/3rd_party/threadpool.h b/src/3rd_party/threadpool.h index fb77dfe6..1938b95c 100644 --- a/src/3rd_party/threadpool.h +++ b/src/3rd_party/threadpool.h @@ -45,6 +45,7 @@ class ThreadPool { template auto enqueue(F&& f, Args&&... args) -> std::future::type>; + ~ThreadPool(); size_t getNumTasks() const { @@ -128,6 +129,3 @@ inline ThreadPool::~ThreadPool() { worker.join(); } } - - - diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 162c53bd..c56b5c80 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -13,8 +13,13 @@ cuda_add_library(marian_lib graph/node_operators.cu tensors/tensor.cu kernels/tensor_operators.cu + kernels/dropout.cu layers/param_initializers.cpp common/utils.cpp + common/logging.cpp + common/history.cpp + training/config.cpp + translator/nth_element.cu data/vocab.cpp data/corpus.cpp $ @@ -27,30 +32,39 @@ cuda_add_executable( test/tensor_test.cu ) +cuda_add_executable( + marian_translate + test/marian_translate.cu +) + cuda_add_executable( marian_test test/marian_test.cu ) +cuda_add_executable( + bn_test + test/bn_test.cu +) + cuda_add_executable( marian - command/config.cpp command/marian.cu ) cuda_add_executable( dropout_test test/dropout_test.cu - kernels/dropout_cudnn.cu ) target_link_libraries(marian marian_lib) target_link_libraries(tensor_test marian_lib) target_link_libraries(marian_test marian_lib) target_link_libraries(dropout_test marian_lib) +target_link_libraries(marian_translate marian_lib) +target_link_libraries(bn_test marian_lib) -foreach(exec tensor_test marian_test marian dropout_test) - target_link_libraries(${exec} ${EXT_LIBS} cudnn) +foreach(exec dropout_test tensor_test marian_test marian_translate marian bn_test) target_link_libraries(${exec} ${EXT_LIBS} curand) cuda_add_cublas_to_target(${exec}) set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") diff --git a/src/command/marian.cu b/src/command/marian.cu index dd8b3301..8e255fa2 100644 --- a/src/command/marian.cu +++ b/src/command/marian.cu @@ -1,86 +1,13 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "marian.h" -#include "optimizers/optimizers.h" -#include "optimizers/clippers.h" -#include "data/batch_generator.h" -#include "data/corpus.h" -#include "models/nematus.h" - -#include 
"common/logging.h" -#include "command/config.h" -#include "parallel/graph_group.h" - -namespace marian { - - void TrainingLoop(Ptr options, - Ptr> batchGenerator) { - - auto reporter = New(options); - Ptr graphGroup = New>(options); - graphGroup->setReporter(reporter); - - size_t epochs = 1; - size_t batches = 0; - while((options->get("after-epochs") == 0 - || epochs <= options->get("after-epochs")) && - (options->get("after-batches") == 0 - || batches < options->get("after-batches"))) { - - batchGenerator->prepare(!options->get("no-shuffle")); - - boost::timer::cpu_timer timer; - - while(*batchGenerator) { - - auto batch = batchGenerator->next(); - graphGroup->update(batch); - - } - epochs++; - LOG(info) << "Starting epoch " << epochs << " after " - << reporter->samples << " samples"; - } - LOG(info) << "Training finshed"; - graphGroup->save(); - } -} +#include "models/gnmt.h" int main(int argc, char** argv) { using namespace marian; - using namespace data; - using namespace keywords; - std::shared_ptr info; - info = spdlog::stderr_logger_mt("info"); - info->set_pattern("[%Y-%m-%d %T] %v"); - - auto options = New(argc, argv); - std::cerr << *options << std::endl; - - auto dimVocabs = options->get>("dim-vocabs"); - int dimEmb = options->get("dim-emb"); - int dimRnn = options->get("dim-rnn"); - int dimBatch = options->get("mini-batch"); - int dimMaxiBatch = options->get("maxi-batch"); + auto options = New(argc, argv);; - auto trainSets = options->get>("trainsets"); - auto vocabs = options->get>("vocabs"); - size_t maxSentenceLength = options->get("max-length"); - auto corpus = New(trainSets, vocabs, dimVocabs, maxSentenceLength); - auto bg = New>(corpus, dimBatch, dimMaxiBatch); - - TrainingLoop(options, bg); + Train>(options); return 0; } diff --git a/src/common/definitions.h b/src/common/definitions.h index 9e117a2a..997333f0 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -30,6 +30,7 @@ #include #include "shape.h" +#include "common/logging.h" namespace marian { @@ -93,7 +94,6 @@ namespace marian { // An enumeration of directions enum struct dir { forward, backward, bidirect }; - /** * @brief Defines a set of keywords. * @@ -101,27 +101,32 @@ namespace marian { * will result in the creation of an instance of the Keyword class. 
*/ namespace keywords { - KEY(axis, int) - KEY(shape, Shape) - KEY(value, float) - KEY(prefix, std::string) - KEY(final, bool) - KEY(output_last, bool) - KEY(activation, act) - KEY(direction, dir) - KEY(mask, Expr) - KEY(init, std::function) + KEY(axis, int); + KEY(shape, Shape); + KEY(value, float); + KEY(prefix, std::string); + KEY(final, bool); + KEY(output_last, bool); + KEY(activation, act); + KEY(direction, dir); + KEY(mask, Expr); + KEY(dropout_prob, float); + KEY(init, std::function); - KEY(eta, float) - KEY(beta1, float) - KEY(beta2, float) - KEY(eps, float) - KEY(optimizer, Ptr) - KEY(clip, Ptr) - KEY(batch_size, int) - KEY(max_epochs, int) - KEY(valid, Ptr) + KEY(eta, float); + KEY(beta1, float); + KEY(beta2, float); + KEY(eps, float); + KEY(optimizer, Ptr); + KEY(clip, Ptr); + KEY(batch_size, int); + KEY(normalize, bool); + KEY(skip, bool); + KEY(skip_first, bool); + KEY(coverage, Expr); + KEY(max_epochs, int); + KEY(valid, Ptr); } } diff --git a/src/common/history.cpp b/src/common/history.cpp new file mode 100644 index 00000000..3d3ad857 --- /dev/null +++ b/src/common/history.cpp @@ -0,0 +1,10 @@ +#include "history.h" + +namespace marian { + +History::History(size_t lineNo) + : normalize_(true), + lineNo_(lineNo) +{} + +} diff --git a/src/common/history.h b/src/common/history.h new file mode 100755 index 00000000..fca8a4b5 --- /dev/null +++ b/src/common/history.h @@ -0,0 +1,79 @@ +#pragma once + +#include + +#include "hypothesis.h" + +namespace marian { + +class History { + private: + struct HypothesisCoord { + bool operator<(const HypothesisCoord& hc) const { + return cost < hc.cost; + } + + size_t i; + size_t j; + float cost; + }; + + public: + History(size_t lineNo); + + void Add(const Beam& beam, bool last = false) { + if (beam.back()->GetPrevHyp() != nullptr) { + for (size_t j = 0; j < beam.size(); ++j) + if(beam[j]->GetWord() == 0 || last) { + float cost = normalize_ ? 
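                   // with normalization enabled, the accumulated cost is
                   // divided by the hypothesis length (current history size)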
beam[j]->GetCost() / history_.size() : beam[j]->GetCost(); + topHyps_.push({ history_.size(), j, cost }); + } + } + history_.push_back(beam); + } + + size_t size() const { + return history_.size(); + } + + NBestList NBest(size_t n) const { + NBestList nbest; + auto topHypsCopy = topHyps_; + while (nbest.size() < n && !topHypsCopy.empty()) { + auto bestHypCoord = topHypsCopy.top(); + topHypsCopy.pop(); + + size_t start = bestHypCoord.i; + size_t j = bestHypCoord.j; + + Words targetWords; + Ptr bestHyp = history_[start][j]; + while(bestHyp->GetPrevHyp() != nullptr) { + targetWords.push_back(bestHyp->GetWord()); + bestHyp = bestHyp->GetPrevHyp(); + } + + std::reverse(targetWords.begin(), targetWords.end()); + nbest.emplace_back(targetWords, history_[bestHypCoord.i][bestHypCoord.j]); + } + return nbest; + } + + Result Top() const { + return NBest(1)[0]; + } + + size_t GetLineNum() const + { return lineNo_; } + + private: + std::vector history_; + std::priority_queue topHyps_; + bool normalize_; + size_t lineNo_; + +}; + +typedef std::vector Histories; + +} diff --git a/src/common/hypothesis.h b/src/common/hypothesis.h new file mode 100644 index 00000000..08744566 --- /dev/null +++ b/src/common/hypothesis.h @@ -0,0 +1,58 @@ +#pragma once +#include + +#include "common/definitions.h" + +namespace marian { + +class Hypothesis { + public: + Hypothesis() + : prevHyp_(nullptr), + prevIndex_(0), + word_(0), + cost_(0.0) + {} + + Hypothesis(const Ptr prevHyp, size_t word, size_t prevIndex, float cost) + : prevHyp_(prevHyp), + prevIndex_(prevIndex), + word_(word), + cost_(cost) + {} + + const Ptr GetPrevHyp() const { + return prevHyp_; + } + + size_t GetWord() const { + return word_; + } + + size_t GetPrevStateIndex() const { + return prevIndex_; + } + + float GetCost() const { + return cost_; + } + + std::vector& GetCostBreakdown() { + return costBreakdown_; + } + + private: + const Ptr prevHyp_; + const size_t prevIndex_; + const size_t word_; + const float cost_; + std::vector costBreakdown_; +}; + +typedef std::vector> Beam; +typedef std::vector Beams; +typedef std::vector Words; +typedef std::pair> Result; +typedef std::vector NBestList; + +} diff --git a/src/common/keywords.h b/src/common/keywords.h index db0b5f20..01eb5898 100644 --- a/src/common/keywords.h +++ b/src/common/keywords.h @@ -219,7 +219,7 @@ namespace keywords { */ #define KEY(name, value_type) \ typedef const Keyword name ## _k; \ -name ## _k name; +name ## _k name } diff --git a/src/common/logging.cpp b/src/common/logging.cpp new file mode 100644 index 00000000..8621205e --- /dev/null +++ b/src/common/logging.cpp @@ -0,0 +1,42 @@ +#include "logging.h" +#include "training/config.h" + +std::shared_ptr stderrLogger(const std::string& name, + const std::string& pattern, + const std::vector& files) { + std::vector sinks; + + auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance(); + sinks.push_back(stderr_sink); + + for(auto&& file : files) { + auto file_sink = std::make_shared(file, true); + sinks.push_back(file_sink); + } + + auto logger = std::make_shared(name, begin(sinks), end(sinks)); + + spdlog::register_logger(logger); + logger->set_pattern(pattern); + return logger; +} + +void createLoggers(const marian::Config& options) { + + std::vector generalLogs; + std::vector validLogs; + if(options.has("log")) { + generalLogs.push_back(options.get("log")); + validLogs.push_back(options.get("log")); + } + + if(options.has("valid-log")) { + validLogs.push_back(options.get("valid-log")); + } + + Logger info{stderrLogger("info", 
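                            // timestamp-only pattern for the general log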
"[%Y-%m-%d %T] %v", generalLogs)}; + Logger config{stderrLogger("config", "[%Y-%m-%d %T] [config] %v", generalLogs)}; + Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v", generalLogs)}; + Logger data{stderrLogger("data", "[%Y-%m-%d %T] [data] %v", generalLogs)}; + Logger valid{stderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs)}; +} diff --git a/src/common/logging.h b/src/common/logging.h index 9959fc44..2583a071 100644 --- a/src/common/logging.h +++ b/src/common/logging.h @@ -3,3 +3,15 @@ #include "spdlog/spdlog.h" #define LOG(logger) spdlog::get(#logger)->info() + +typedef std::shared_ptr Logger; +Logger stderrLogger(const std::string&, const std::string&, + const std::vector& = {}); + +namespace marian { + class Config; +} + +void createLoggers(const marian::Config& options); + + diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h index 3c6832fc..353a4df2 100644 --- a/src/data/batch_generator.h +++ b/src/data/batch_generator.h @@ -5,7 +5,8 @@ #include -#include "dataset.h" +#include "data/dataset.h" +#include "training/config.h" namespace marian { @@ -21,22 +22,24 @@ class BatchGenerator { private: Ptr data_; + Ptr options_; + typename DataSet::iterator current_; - size_t batchSize_; size_t maxiBatchSize_; std::deque bufferedBatches_; BatchPtr currentBatch_; - void fillBatches() { + void fillBatches(bool shuffle=true) { auto cmp = [](const sample& a, const sample& b) { return a[0].size() < b[0].size(); }; std::priority_queue maxiBatch(cmp); - while(current_ != data_->end() && maxiBatch.size() < maxiBatchSize_) { + int maxSize = options_->get("mini-batch") * options_->get("maxi-batch"); + while(current_ != data_->end() && maxiBatch.size() < maxSize) { maxiBatch.push(*current_); current_++; } @@ -45,7 +48,7 @@ class BatchGenerator { while(!maxiBatch.empty()) { batchVector.push_back(maxiBatch.top()); maxiBatch.pop(); - if(batchVector.size() == batchSize_) { + if(batchVector.size() == options_->get("mini-batch")) { bufferedBatches_.push_back(data_->toBatch(batchVector)); batchVector.clear(); } @@ -53,17 +56,15 @@ class BatchGenerator { if(!batchVector.empty()) bufferedBatches_.push_back(data_->toBatch(batchVector)); - std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end()); + if(shuffle) + std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end()); } public: BatchGenerator(Ptr data, - size_t batchSize=80, - size_t maxiBatchNum=20) + Ptr options) : data_(data), - batchSize_(batchSize), - maxiBatchSize_(batchSize * maxiBatchNum) - { } + options_(options) { } operator bool() const { return !bufferedBatches_.empty(); @@ -84,8 +85,10 @@ class BatchGenerator { void prepare(bool shuffle=true) { if(shuffle) data_->shuffle(); + else + data_->reset(); current_ = data_->begin(); - fillBatches(); + fillBatches(shuffle); } }; diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp index 03da5ea2..6230bd68 100644 --- a/src/data/corpus.cpp +++ b/src/data/corpus.cpp @@ -1,5 +1,6 @@ #include -#include "corpus.h" + +#include "data/corpus.h" namespace marian { namespace data { @@ -33,20 +34,53 @@ const SentenceTuple& CorpusIterator::dereference() const { return tup_; } -Corpus::Corpus(const std::vector& textPaths, - const std::vector& vocabPaths, - const std::vector& maxVocabs, - size_t maxLength) - : textPaths_(textPaths), - maxLength_(maxLength) -{ - UTIL_THROW_IF2(textPaths.size() != vocabPaths.size(), +Corpus::Corpus(Ptr options) + : options_(options), + textPaths_(options_->get>("train-sets")), + maxLength_(options_->get("max-length")) { + 
+ std::vector vocabPaths; + if(options_->has("vocabs")) + vocabPaths = options_->get>("vocabs"); + + UTIL_THROW_IF2(!vocabPaths.empty() && textPaths_.size() != vocabPaths.size(), "Number of corpus files and vocab files does not agree"); + std::vector maxVocabs = + options_->get>("dim-vocabs"); + std::vector vocabs; - for(int i = 0; i < vocabPaths.size(); ++i) { - vocabs_.emplace_back(vocabPaths[i], maxVocabs[i]); + if(vocabPaths.empty()) { + for(int i = 0; i < textPaths_.size(); ++i) { + Ptr vocab = New(); + vocab->loadOrCreate(textPaths_[i], maxVocabs[i]); + vocabs_.emplace_back(vocab); + } } + else { + for(int i = 0; i < vocabPaths.size(); ++i) { + Ptr vocab = New(); + vocab->load(vocabPaths[i], maxVocabs[i]); + vocabs_.emplace_back(vocab); + } + } + + + for(auto path : textPaths_) { + files_.emplace_back(new InputFileStream(path)); + } +} + +Corpus::Corpus(std::vector paths, + std::vector> vocabs, + Ptr options) + : options_(options), + textPaths_(paths), + vocabs_(vocabs), + maxLength_(options_->get("max-length")) { + + UTIL_THROW_IF2(textPaths_.size() != vocabs_.size(), + "Number of corpus files and vocab files does not agree"); for(auto path : textPaths_) { files_.emplace_back(new InputFileStream(path)); @@ -61,7 +95,7 @@ SentenceTuple Corpus::next() { for(int i = 0; i < files_.size(); ++i) { std::string line; if(std::getline((std::istream&)*files_[i], line)) { - Words words = vocabs_[i](line); + Words words = (*vocabs_[i])(line); if(words.empty()) words.push_back(0); tup.push_back(words); @@ -82,8 +116,15 @@ void Corpus::shuffle() { shuffleFiles(textPaths_); } +void Corpus::reset() { + files_.clear(); + for(auto& path : textPaths_) { + files_.emplace_back(new InputFileStream(path)); + } +} + void Corpus::shuffleFiles(const std::vector& paths) { - std::cerr << "Shuffling files" << std::endl; + LOG(data) << "Shuffling files"; std::vector> corpus; files_.clear(); @@ -129,7 +170,7 @@ void Corpus::shuffleFiles(const std::vector& paths) { files_.emplace_back(new InputFileStream(path)); } - std::cerr << "Done" << std::endl; + LOG(data) << "Done"; } } diff --git a/src/data/corpus.h b/src/data/corpus.h index 58233a6e..c3a0c358 100644 --- a/src/data/corpus.h +++ b/src/data/corpus.h @@ -4,6 +4,7 @@ #include #include +#include "training/config.h" #include "common/definitions.h" #include "data/vocab.h" #include "common/file_stream.h" @@ -38,11 +39,11 @@ class CorpusBatch { } std::cerr << std::endl; - std::cerr << "\t m: "; - for(auto w : b.second) { - std::cerr << w << " "; - } - std::cerr << std::endl; + //std::cerr << "\t m: "; + //for(auto w : b.second) { + //std::cerr << w << " "; + //} + //std::cerr << std::endl; } } } @@ -88,9 +89,11 @@ class CorpusIterator class Corpus { private: + Ptr options_; + std::vector textPaths_; std::vector> files_; - std::vector vocabs_; + std::vector> vocabs_; size_t maxLength_; void shuffleFiles(const std::vector& paths); @@ -102,14 +105,17 @@ class Corpus { typedef CorpusIterator iterator; typedef SentenceTuple sample; - Corpus(const std::vector& textPaths, - const std::vector& vocabPaths, - const std::vector& maxVocabs, - size_t maxLength = 50); + Corpus(Ptr options); + + Corpus(std::vector paths, + std::vector> vocabs, + Ptr options); sample next(); void shuffle(); + + void reset(); iterator begin() { return iterator(*this); @@ -118,6 +124,10 @@ class Corpus { iterator end() { return iterator(); } + + std::vector>& getVocabs() { + return vocabs_; + } batch_ptr toBatch(const std::vector& batchVector) { int batchSize = batchVector.size(); diff --git 
a/src/data/mnist.h b/src/data/mnist.h deleted file mode 100644 index bf1815dc..00000000 --- a/src/data/mnist.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - -#include -#include -#include -#include -#include -#include - -#include "dataset.h" -#include "batch_generator.h" - -namespace marian { -namespace data { - -/** @brief DataBase capable of reading MNIST data. */ -class MNIST : public DataBase { - private: - const int IMAGE_MAGIC_NUMBER; - const int LABEL_MAGIC_NUMBER; - - Examples examples_; - - public: - - typedef Batch batch_type; - typedef std::shared_ptr batch_ptr; - - /** - * @brief Constructs a DataBase using MNIST data. 
- * - * @param featuresPath Path to file containing MNIST feature values - * @param labelsPath Path to file containing MNIST labels - */ - MNIST(const std::string& featuresPath, - const std::string& labelsPath) - : IMAGE_MAGIC_NUMBER(2051), - LABEL_MAGIC_NUMBER(2049) - { - auto features = ReadImages(featuresPath); - auto labels = ReadLabels(labelsPath); - - UTIL_THROW_IF2(features.size() != labels.size(), - "Features do not match labels"); - - for(int i = 0; i < features.size(); ++i) - examples_.emplace_back(new Example({ features[i], labels[i] })); - } - - ExampleIterator begin() const { - return ExampleIterator(examples_.begin()); - } - - ExampleIterator end() const { - return ExampleIterator(examples_.end()); - } - - void shuffle() { - std::random_shuffle(examples_.begin(), examples_.end()); - } - - batch_ptr toBatch(const Examples& batchVector) { - int batchSize = batchVector.size(); - - std::vector maxDims; - for(auto& ex : batchVector) { - if(maxDims.size() < ex->size()) - maxDims.resize(ex->size(), 0); - for(int i = 0; i < ex->size(); ++i) { - if((*ex)[i]->size() > maxDims[i]) - maxDims[i] = (*ex)[i]->size(); - } - } - - batch_ptr batch(new Batch()); - std::vector iterators; - for(auto& m : maxDims) { - batch->push_back(Shape({batchSize, m})); - iterators.push_back(batch->inputs().back().begin()); - } - - for(auto& ex : batchVector) { - for(int i = 0; i < ex->size(); ++i) { - DataPtr d = (*ex)[i]; - d->resize(maxDims[i], 0.0f); - iterators[i] = std::copy(d->begin(), d->end(), iterators[i]); - } - } - return batch; - } - - private: - typedef unsigned char uchar; - - int reverseInt(int i) { - unsigned char c1, c2, c3, c4; - c1 = i & 255, c2 = (i >> 8) & 255, c3 = (i >> 16) & 255, c4 = (i >> 24) & 255; - return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4; - } - - std::vector ReadImages(const std::string& full_path) { - std::ifstream file(full_path); - UTIL_THROW_IF2(!file.is_open(), - "Cannot open file `" + full_path + "`!"); - - int magic_number = 0; - file.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - - UTIL_THROW_IF2(magic_number != IMAGE_MAGIC_NUMBER, - "Invalid MNIST image file!"); - - int number_of_images; - int n_rows = 0; - int n_cols = 0; - - file.read((char *)&number_of_images, sizeof(number_of_images)); - number_of_images = reverseInt(number_of_images); - file.read((char *)&n_rows, sizeof(n_rows)); - n_rows = reverseInt(n_rows); - file.read((char *)&n_cols, sizeof(n_cols)); - n_cols = reverseInt(n_cols); - - int imgSize = n_rows * n_cols; - std::vector _dataset(number_of_images); - for(int i = 0; i < number_of_images; ++i) { - _dataset[i].reset(new Data(imgSize, 0)); - for (int j = 0; j < imgSize; j++) { - unsigned char pixel = 0; - file.read((char*)&pixel, sizeof(pixel)); - (*_dataset[i])[j] = pixel / 255.0f; - } - } - return _dataset; - } - - std::vector ReadLabels(const std::string& full_path) { - std::ifstream file(full_path); - - if (! 
file.is_open()) - throw std::runtime_error("Cannot open file `" + full_path + "`!"); - - int magic_number = 0; - file.read((char *)&magic_number, sizeof(magic_number)); - magic_number = reverseInt(magic_number); - - if (magic_number != LABEL_MAGIC_NUMBER) - throw std::runtime_error("Invalid MNIST label file!"); - - int number_of_labels; - file.read((char *)&number_of_labels, sizeof(number_of_labels)); - number_of_labels = reverseInt(number_of_labels); - - std::vector _dataset(number_of_labels); - for (int i = 0; i < number_of_labels; i++) { - _dataset[i].reset(new Data(1, 0.0f)); - unsigned char label; - file.read((char*)&label, 1); - (*_dataset[i])[0] = label; - } - - return _dataset; - } -}; - -} // namespace mnist -} diff --git a/src/data/trainer.h b/src/data/trainer.h deleted file mode 100644 index 3c9420df..00000000 --- a/src/data/trainer.h +++ /dev/null @@ -1,142 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "common/keywords.h" -#include "common/definitions.h" -#include "graph/expression_graph.h" -#include "optimizers/optimizers.h" -#include "data/batch_generator.h" - -namespace marian { - -class RunBase { - public: - virtual void run() = 0; -}; - -typedef std::shared_ptr RunBasePtr; - -template -class Trainer : public RunBase, - public keywords::Keywords { - private: - ExpressionGraphPtr graph_; - std::shared_ptr dataset_; - - public: - template - Trainer(ExpressionGraphPtr graph, - std::shared_ptr dataset, - Args... args) - : Keywords(args...), - graph_(graph), - dataset_(dataset) - {} - - void run() { - using namespace data; - using namespace keywords; - boost::timer::cpu_timer trainTimer; - - auto opt = Get(optimizer, Optimizer()); - auto batchSize = Get(batch_size, 200); - auto maxEpochs = Get(max_epochs, 50); - BatchGenerator bg(dataset_, batchSize); - - auto validator = Get(valid, RunBasePtr()); - - size_t update = 0; - for(int epoch = 1; epoch <= maxEpochs; ++epoch) { - boost::timer::cpu_timer epochTimer; - bg.prepare(); - - float cost = 0; - float totalExamples = 0; - while(bg) { - auto batch = bg.next(); - opt->update(graph_); - cost += graph_->get("cost")->val()->scalar() * batch->dim(); - totalExamples += batch->dim(); - update++; - } - cost = cost / totalExamples; - - std::cerr << "Epoch: " << std::setw(std::to_string(maxEpochs).size()) - << epoch << "/" << maxEpochs << " - Update: " << update - << " - Cost: " << std::fixed << std::setprecision(4) << cost - << " - Time: " << epochTimer.format(2, "%ws") - << " - " << trainTimer.format(0, "%ws") << std::endl; - - if(validator) - validator->run(); - } - } -}; - -template -class Validator : public RunBase, - public keywords::Keywords { - private: - ExpressionGraphPtr graph_; - std::shared_ptr dataset_; - - float correct(const std::vector pred, const std::vector labels) { - size_t num = labels.size(); - size_t scores = pred.size() / num; - size_t acc = 0; - for (size_t i = 0; i < num; ++i) { - size_t proposed = 0; - for(size_t j = 0; j < scores; ++j) { - if(pred[i * scores + j] > pred[i * scores + proposed]) - proposed = j; - } - acc += (proposed == labels[i]); - } - return (float)acc; - } - - public: - template - Validator(ExpressionGraphPtr graph, - std::shared_ptr dataset, - Args... 
args) - : Keywords(args...), - graph_(graph), - dataset_(dataset) - {} - - void run() { - using namespace data; - using namespace keywords; - - auto batchSize = Get(batch_size, 200); - BatchGenerator bg(dataset_, batchSize); - - size_t update = 0; - bg.prepare(false); - - float total = 0; - float cor = 0; - while(bg) { - auto batch = bg.next(); - graph_->forward(); - std::vector scores; - graph_->get("scores")->val()->get(scores); - - cor += correct(scores, batch->inputs()[1].data()); - total += batch->dim(); - update++; - } - std::cerr << "Accuracy: " << cor / total << std::endl; - } -}; - -template -RunBasePtr Run(Args&& ...args) { - return RunBasePtr(new Process(args...)); -} - -} diff --git a/src/data/types.h b/src/data/types.h index d126395d..4232a985 100644 --- a/src/data/types.h +++ b/src/data/types.h @@ -7,6 +7,8 @@ typedef size_t Word; typedef std::vector Words; -const Word EOS = 0; -const Word UNK = 1; +const Word EOS_ID = 0; +const Word UNK_ID = 1; +const std::string EOS_STR = ""; +const std::string UNK_STR = ""; diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp index 372806f9..4d62c765 100644 --- a/src/data/vocab.cpp +++ b/src/data/vocab.cpp @@ -1,26 +1,15 @@ #include +#include #include "data/vocab.h" #include "common/utils.h" #include "common/file_stream.h" #include "3rd_party/exception.h" #include "3rd_party/yaml-cpp/yaml.h" +#include "common/logging.h" -Vocab::Vocab(const std::string& path, int max) { - YAML::Node vocab = YAML::Load(InputFileStream(path)); - for(auto&& pair : vocab) { - auto str = pair.first.as(); - auto id = pair.second.as(); - if (id < (Word)max) { - str2id_[str] = id; - if(id >= id2str_.size()) - id2str_.resize(id + 1); - id2str_[id] = str; - } - } - UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << path); - id2str_[0] = ""; +Vocab::Vocab() { } size_t Vocab::operator[](const std::string& word) const { @@ -28,7 +17,7 @@ size_t Vocab::operator[](const std::string& word) const { if(it != str2id_.end()) return it->second; else - return 1; + return UNK_ID; } Words Vocab::operator()(const std::vector& lineTokens, bool addEOS) const { @@ -36,7 +25,7 @@ Words Vocab::operator()(const std::vector& lineTokens, bool addEOS) std::transform(lineTokens.begin(), lineTokens.end(), words.begin(), [&](const std::string& w) { return (*this)[w]; }); if(addEOS) - words.push_back(EOS); + words.push_back(EOS_ID); return words; } @@ -49,7 +38,7 @@ Words Vocab::operator()(const std::string& line, bool addEOS) const { std::vector Vocab::operator()(const Words& sentence, bool ignoreEOS) const { std::vector decoded; for(size_t i = 0; i < sentence.size(); ++i) { - if(sentence[i] != EOS || !ignoreEOS) { + if(sentence[i] != EOS_ID || !ignoreEOS) { decoded.push_back((*this)[sentence[i]]); } } @@ -65,3 +54,91 @@ const std::string& Vocab::operator[](size_t id) const { size_t Vocab::size() const { return id2str_.size(); } + +void Vocab::loadOrCreate(const std::string& trainPath, int max) +{ + if(boost::filesystem::exists(trainPath + ".json")) { + load(trainPath + ".json", max); + return; + } + if(boost::filesystem::exists(trainPath + ".yml")) { + load(trainPath + ".yml", max); + return; + } + + create(trainPath + ".yml", max, trainPath); + load(trainPath + ".yml", max); +} + +void Vocab::load(const std::string& vocabPath, int max) +{ + LOG(data) << "Loading vocabulary from " << vocabPath << " (max: " << max << ")"; + YAML::Node vocab = YAML::Load(InputFileStream(vocabPath)); + for(auto&& pair : vocab) { + auto str = pair.first.as(); + auto id = pair.second.as(); + if (id < 
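        // entries beyond the requested maximum vocabulary size are skipped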
(Word)max) { + str2id_[str] = id; + if(id >= id2str_.size()) + id2str_.resize(id + 1); + id2str_[id] = str; + } + } + UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << vocabPath); + + id2str_[EOS_ID] = EOS_STR; + id2str_[UNK_ID] = UNK_STR; +} + +class Vocab::VocabFreqOrderer +{ +public: + bool operator()(const Vocab::Str2Id::value_type* a, const Vocab::Str2Id::value_type* b) const { + return a->second < b->second; + } +}; + +void Vocab::create(const std::string& vocabPath, int max, const std::string& trainPath) +{ + LOG(data) << "Creating vocabulary " << vocabPath + << " from " << trainPath << " (max: " << max << ")"; + + UTIL_THROW_IF2(boost::filesystem::exists(vocabPath), + "Vocab file " << vocabPath << " exist. Not overwriting"); + + InputFileStream trainStrm(trainPath); + + Str2Id vocab; + std::string line; + while (getline((std::istream&)trainStrm, line)) { + std::vector toks; + Split(line, toks); + + for (const std::string &tok: toks) { + Str2Id::iterator iter = vocab.find(tok); + if (iter == vocab.end()) + vocab[tok] = 1; + else + iter->second++; + } + } + + // put into vector & sort + std::vector vocabVec; + vocabVec.reserve(max); + + for (const Str2Id::value_type &p: vocab) + vocabVec.push_back(&p); + std::sort(vocabVec.rbegin(), vocabVec.rend(), VocabFreqOrderer()); + + YAML::Node vocabYaml; + vocabYaml[EOS_STR] = EOS_ID; + vocabYaml[UNK_STR] = UNK_ID; + for(size_t i = 0; i < vocabVec.size(); ++i) { + const Str2Id::value_type *p = vocabVec[i]; + vocabYaml[p->first] = i + 2; + } + + OutputFileStream vocabStrm(vocabPath); + (std::ostream&)vocabStrm << vocabYaml; +} diff --git a/src/data/vocab.h b/src/data/vocab.h index e7e2c416..e61ad605 100644 --- a/src/data/vocab.h +++ b/src/data/vocab.h @@ -8,7 +8,7 @@ class Vocab { public: - Vocab(const std::string& path, int max = 50000); + Vocab(); size_t operator[](const std::string& word) const; @@ -22,7 +22,16 @@ class Vocab { size_t size() const; + void loadOrCreate(const std::string& textPath, int max); + void load(const std::string& vocabPath, int max); + void create(const std::string& vocabPath, int max, const std::string& trainPath); + private: - std::map str2id_; - std::vector id2str_; + typedef std::map Str2Id; + Str2Id str2id_; + + typedef std::vector Id2Str; + Id2Str id2str_; + + class VocabFreqOrderer; }; diff --git a/src/graph/chainable.h b/src/graph/chainable.h index ef358325..07bc294e 100644 --- a/src/graph/chainable.h +++ b/src/graph/chainable.h @@ -23,6 +23,7 @@ #include #include +#include #include "exception.h" @@ -106,6 +107,8 @@ struct Chainable { virtual void debug(const std::string& message) = 0; virtual bool marked_for_debug() = 0; virtual const std::string& debug_message() = 0; + + virtual size_t hash() = 0; }; /** diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 15885553..dc30803f 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -1,26 +1,5 @@ #pragma once -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. - #include #include #include @@ -32,7 +11,9 @@ #include "data/batch_generator.h" #include "tensors/tensor_allocator.h" #include "layers/param_initializers.h" +#include "kernels/dropout.h" #include "3rd_party/threadpool.h" +#include "3rd_party/cnpy/cnpy.h" namespace marian { @@ -66,9 +47,10 @@ class ExpressionGraph : public std::enable_shared_from_this { Ptr tensors_; cublasHandle_t cublasHandle_; + curandGenerator_t curandGenerator_; size_t device_{0}; - - size_t stale_{0}; + + std::unordered_map hashMap_; protected: /** @brief Constructs a new expression graph @@ -84,17 +66,26 @@ class ExpressionGraph : public std::enable_shared_from_this { public: + ~ExpressionGraph() { + clear(); + } + void setDevice(size_t device = 0) { device_ = device; params_.init(device); tensors_ = New(device); cublasHandle_ = create_handle(device); + curandGenerator_ = createCurandGenerator(device, 1234); } cublasHandle_t getCublasHandle() { return cublasHandle_; } + curandGenerator_t getCurandGenerator() { + return curandGenerator_; + } + size_t getDevice() { return device_; } @@ -132,26 +123,34 @@ class ExpressionGraph : public std::enable_shared_from_this { * @param batchSize XXX Marcin, could you provide a description of this param? 
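 *
 * (With this change the method returns the index one past the last node
 * executed, so a later forward(pos) call can resume the pass incrementally.)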
*/ - void forward() { + size_t forward() { params_.allocateForward(); - for(auto&& tape : tapes_) { - for(auto&& v : tape) { - v->allocate(); - v->init(); - v->forward(); + return forward(0); + } - // @TODO: should be done in node - for(auto&& child : v->children()) { - v->decreaseEdges(1); - child->decreaseEdges(1); - } + size_t forward(size_t pos) { + // @TODO: check if allocation works properly - if(v->marked_for_debug()) { - std::cerr << "Debug: " << v->debug_message() << std::endl; - std::cerr << v->val()->debug() << std::endl; - } + auto it = nodes_.begin() + pos; + while(it != nodes_.end()) { + auto v = *it; + v->allocate(); + v->init(); + v->forward(); + + // @TODO: should be done in node + for(auto&& child : v->children()) { + v->decreaseEdges(1); + child->decreaseEdges(1); } + + if(v->marked_for_debug()) { + std::cerr << "Debug: " << v->debug_message() << std::endl; + std::cerr << v->val()->debug() << std::endl; + } + it++; } + return std::distance(nodes_.begin(), it); } /** @@ -172,7 +171,7 @@ class ExpressionGraph : public std::enable_shared_from_this { params_.allocateBackward(); params_.set_zero_adjoint(); - + for(auto&& v : topNodes_) v->init_dependent(); @@ -202,7 +201,7 @@ class ExpressionGraph : public std::enable_shared_from_this { it++; } } - + /** * @brief Returns a string representing this expression graph in graphviz notation. * @@ -300,8 +299,6 @@ class ExpressionGraph : public std::enable_shared_from_this { * * This method does not attach the new constant node to any existing expression graph. * - * @param args XXX Marcin, what are args here? - * * @return a newly constructed constant node */ template @@ -341,6 +338,17 @@ class ExpressionGraph : public std::enable_shared_from_this { args...); } + template + inline Expr dropout(float prob, Shape shape) { + auto dropoutInit = [prob, this](Tensor t) { + Dropout(t, prob, getCurandGenerator()); + }; + + return Expression(shared_from_this(), + keywords::init=dropoutInit, + keywords::shape=shape); + } + /*********************************************************/ /** @@ -387,10 +395,18 @@ class ExpressionGraph : public std::enable_shared_from_this { named_.emplace(name, e); } - void add(Expr node) { + Expr add(Expr node) { size_t group = 0; + size_t hash = node->hash(); + auto it = hashMap_.find(hash); + if(it != hashMap_.end()) + return it->second; + + hashMap_[hash] = node; + node->setId(count_++); + for(auto& child: node->children()) { group = std::max(group, tapeMap_[child] + 1); child->increaseEdges(2); @@ -402,6 +418,8 @@ class ExpressionGraph : public std::enable_shared_from_this { tapes_[group].push_back(node); nodes_.push_back(node); topNodes_.insert(node); + + return node; } void remove_top_node(Expr node) { @@ -428,18 +446,72 @@ class ExpressionGraph : public std::enable_shared_from_this { inputs_.clear(); topNodes_.clear(); tensors_->clear(); + hashMap_.clear(); } - + Expr topNode() { return nodes_.back(); } + + void load(const std::string& name) { + using namespace keywords; + + LOG(info) << "Loading model from " << name; + + auto numpy = cnpy::npz_load(name); + + for(auto it : numpy) { + auto name = it.first; + + Shape shape; + if(it.second.shape.size() == 2) { + shape.set(0, it.second.shape[0]); + shape.set(1, it.second.shape[1]); + } + else if(it.second.shape.size() == 1) { + shape.set(0, 1); + shape.set(1, it.second.shape[0]); + } + + param(name, shape, + init=inits::from_numpy(it.second)); + } + } + + void save(const std::string& name) { + LOG(info) << "Saving model to " << name; + + unsigned shape[2]; + 
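    // cnpy::npz_save is called with mode "w" for the first parameter,
    // creating the archive, then with "a" to append the remaining ones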
std::string mode = "w"; + + cudaSetDevice(getDevice()); + for(auto p : params().getMap()) { + std::vector v; + p.second->val() >> v; + + unsigned dim; + if(p.second->shape()[0] == 1) { + shape[0] = p.second->shape()[1]; + dim = 1; + } + else { + shape[0] = p.second->shape()[0]; + shape[1] = p.second->shape()[1]; + dim = 2; + } + std::string pName = p.first; + cnpy::npz_save(name, pName, v.data(), shape, dim, mode); + mode = "a"; + } + } }; template Expr Expression(Args&& ... args) { + // @TODO check hash, if exists do not add and return + // cached node to minimize calculations auto e = Expr(new T(std::forward(args)...)); - e->graph()->add(e); - return e; + return e->graph()->add(e); } } diff --git a/src/graph/expression_operators.cu b/src/graph/expression_operators.cu index aa0a8245..40687bd1 100644 --- a/src/graph/expression_operators.cu +++ b/src/graph/expression_operators.cu @@ -118,11 +118,6 @@ Expr tanh(const std::vector& nodes) { return Expression(nodes); } -//Expr tanh(Expr a, Expr b, Expr c) { -// std::vector nodes = {a, b, c}; -// return Expression(nodes); -//} - Expr logit(const std::vector&) { UTIL_THROW2("Not implemented"); } @@ -131,5 +126,30 @@ Expr relu(const std::vector&) { UTIL_THROW2("Not implemented"); } +Expr sqrt(Expr a, float eps) { + return Expression(a, eps); +} + +Expr square(Expr a) { + return Expression(a); +} + +Expr layer_norm(Expr x, Expr gamma, Expr beta) { + std::vector nodes = {x, gamma}; + if(beta) + nodes.push_back(beta); + return Expression(nodes); +} + +//Expr batch_norm(Expr x, Expr gamma, Expr beta) { +// auto mju = mean(x, keywords::axis=0); +// auto xmmju = x - mju; +// auto std = sqrt(mean(square(xmmju), keywords::axis=0), 1e-9); +// +// if(beta) +// return gamma * (xmmju / std) + beta; +// else +// return gamma * (xmmju / std); +//} } diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 095d9219..734ef372 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -143,4 +143,25 @@ Expr weighted_average(Expr in, Expr weights, Args ...args) { Expr step(Expr a, size_t step); +Expr sqrt(Expr a, float eps = 0.f); +Expr square(Expr a); + +Expr layer_norm(Expr x, Expr gamma, Expr beta = nullptr); +//Expr batch_norm(Expr x, Expr gamma, Expr beta = nullptr); + +template +Expr dropout(Expr x, Args ...args) { + auto mask = Get(keywords::mask, nullptr, args...); + float dropout_prob = Get(keywords::dropout_prob, 0.0f, args...); + + UTIL_THROW_IF2(!mask && !dropout_prob, + "Neither mask nor dropout prob given"); + if(!mask) { + auto graph = x->graph(); + mask = graph->dropout(dropout_prob, x->shape()); + } + return x * mask; +} + + } diff --git a/src/graph/node.h b/src/graph/node.h index 674eef53..65de3880 100644 --- a/src/graph/node.h +++ b/src/graph/node.h @@ -186,6 +186,7 @@ class Node : public Chainable, }; struct NaryNodeOp : public Node { + size_t hash_{0}; std::vector children_; template @@ -205,6 +206,17 @@ struct NaryNodeOp : public Node { return children_; } + virtual size_t hash() { + if(!hash_) { + std::size_t seed = boost::hash()(name()); + boost::hash_combine(seed, type()); + for(auto child : children()) + boost::hash_combine(seed, child->hash()); + hash_ = seed; + } + return hash_; + } + void remove_children_from_top_nodes(); }; diff --git a/src/graph/node_operators.h b/src/graph/node_operators.h index af5bf315..98701e3a 100644 --- a/src/graph/node_operators.h +++ b/src/graph/node_operators.h @@ -82,6 +82,11 @@ struct ConstantNode : public Node { return "white"; } + virtual size_t 
hash() { + // @TODO: think of something better for constant nodes + return boost::hash()((size_t)this); + } + private: std::function init_; bool initialized_; @@ -117,6 +122,10 @@ struct ParamNode : public Node { return "orangered"; } + virtual size_t hash() { + return boost::hash()((size_t)this); + } + private: std::function init_; bool initialized_; diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index ba6f47bc..51d4b2f9 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -20,13 +20,8 @@ struct DotNodeOp : public NaryNodeOp { auto shapeA = a->shape(); auto shapeB = b->shape(); - Shape outShape; - if((shapeA[2] > 1 || shapeA[3] > 1) && shapeB[2] == 1 && shapeB[3] == 1) - outShape = {shapeA[0], shapeB[1], shapeA[2], shapeA[3]}; - else { - outShape = shapeA; - outShape.set(1, shapeB[1]); - } + Shape outShape = shapeA; + outShape.set(1, shapeB[1]); UTIL_THROW_IF2(shapeA[1] != shapeB[0], "matrix product requires dimensions to match"); return outShape; @@ -338,6 +333,12 @@ struct ConcatenateNodeOp : public NaryNodeOp { Deconcatenate(deconcatenees, adj_, ax_); } + virtual size_t hash() { + size_t seed = NaryNodeOp::hash(); + boost::hash_combine(seed, ax_); + return seed; + } + const std::string type() { return "concat"; } @@ -437,5 +438,33 @@ struct AffineNodeOp : public NaryNodeOp { } }; +struct LayerNormalizationOp : public NaryNodeOp { + LayerNormalizationOp(const std::vector& nodes) + : NaryNodeOp(nodes) {} + + NodeOps forwardOps() { + return { + NodeOp( + LayerNormalization(val_, + children_[0]->val(), + children_[1]->val(), + (children_.size() == 3) ? children_[2]->val() : nullptr)) + }; + } + + NodeOps backwardOps() { + return { + NodeOp(LayerNormalizationGrad(children_[0]->grad(), children_[1]->grad(), (children_.size() == 3) ? children_[2]->grad() : nullptr, + adj_, val_, children_[0]->val(), children_[1]->val(), + (children_.size() == 3) ? children_[2]->val() : nullptr)) + }; + } + + const std::string type() { + return "layer_normalization"; + } + +}; + } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index f58df3bc..2da3c463 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -232,6 +232,16 @@ struct SoftmaxNodeOp : public NaryNodeOp { }; } + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + if(mask_) + boost::hash_combine(hash_, mask_->hash()); + } + return hash_; + } + + NodeOps backwardOps() { // For each row, the Jacobian times vector is given by: // J * dy = p .* (dy - avg*1) @@ -281,9 +291,12 @@ struct LogSoftmaxNodeOp : public UnaryNodeOp { }; struct SumNodeOp : public UnaryNodeOp { + int ax_; + template SumNodeOp(Expr a, Args ...args) - : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) { } + : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...), + ax_(keywords::Get(keywords::axis, -1, args...)) { } NodeOps forwardOps() { return { NodeOp(Reduce(_1, val_, children_[0]->val())) }; @@ -317,19 +330,31 @@ struct SumNodeOp : public UnaryNodeOp { return "orange"; } + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + boost::hash_combine(hash_, ax_); + } + return hash_; + } + + }; struct MeanNodeOp : public UnaryNodeOp { + int ax_; + template MeanNodeOp(Expr a, Args ...args) - : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...) 
{ } + : UnaryNodeOp(a, keywords::shape=newShape(a, args...), args...), + ax_(keywords::Get(keywords::axis, -1, args...)) { } NodeOps forwardOps() { int left = children_[0]->shape().elements() / val_->shape().elements(); float scale = 1.f / left; return { - NodeOp(Reduce(_1 * scale, val_, children_[0]->val())) + NodeOp(Reduce(_1, val_, children_[0]->val(), scale)) }; } @@ -338,7 +363,7 @@ struct MeanNodeOp : public UnaryNodeOp { float scale = 1.f / left; return { - NodeOp(Add(_1 * scale, children_[0]->grad(), adj_)) + NodeOp(Add(_1, children_[0]->grad(), adj_, scale)) }; } @@ -365,6 +390,15 @@ struct MeanNodeOp : public UnaryNodeOp { const std::string color() { return "orange"; } + + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + boost::hash_combine(hash_, ax_); + } + return hash_; + } + }; @@ -423,6 +457,78 @@ struct ExpNodeOp : public UnaryNodeOp { }; +struct SqrtNodeOp : public UnaryNodeOp { + float epsilon_; + + template + SqrtNodeOp(Expr a, float epsilon, Args ...args) + : UnaryNodeOp(a, args...), + epsilon_(epsilon) { } + + NodeOps forwardOps() { + return { + NodeOp(Element(_1 = Sqrt(_2 + epsilon_), + val_, + children_[0]->val())) + }; + } + + NodeOps backwardOps() { + return { + NodeOp(Add(0.5f * (1.f / _1) * _2, + children_[0]->grad(), + val_, + adj_)) + }; + } + + const std::string type() { + return "sqrt"; + } + + virtual size_t hash() { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + boost::hash_combine(seed, epsilon_); + hash_ = seed; + } + return hash_; + } + + +}; + +struct SquareNodeOp : public UnaryNodeOp { + float epsilon_; + + template + SquareNodeOp(Args ...args) + : UnaryNodeOp(args...) { } + + NodeOps forwardOps() { + return { + NodeOp(Element(_1 = _2 * _2, + val_, + children_[0]->val())) + }; + } + + NodeOps backwardOps() { + return { + NodeOp(Add(2.f * _1 * _2, + children_[0]->grad(), + children_[0]->val(), + adj_)) + }; + } + + const std::string type() { + return "square"; + } + +}; + + struct NegNodeOp : public UnaryNodeOp { template NegNodeOp(Args ...args) @@ -489,6 +595,17 @@ struct RowsNodeOp : public UnaryNodeOp { return "orange"; } + virtual size_t hash() { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + for(auto i : indeces_) + boost::hash_combine(seed, i); + hash_ = seed; + } + return hash_; + } + + std::vector indeces_; }; @@ -567,6 +684,17 @@ struct ReshapeNodeOp : public UnaryNodeOp { const std::string color() { return "grey"; } + + virtual size_t hash() { + if(!hash_) { + size_t seed = NaryNodeOp::hash(); + for(auto s : shape()) + boost::hash_combine(seed, s); + hash_ = seed; + } + return hash_; + } + }; struct TimestepNodeOp : public UnaryNodeOp { @@ -619,6 +747,15 @@ struct TimestepNodeOp : public UnaryNodeOp { const std::string color() { return "grey"; } + + virtual size_t hash() { + if(!hash_) { + hash_ = NaryNodeOp::hash(); + boost::hash_combine(hash_, step_); + } + return hash_; + } + }; } diff --git a/src/kernels/dropout.cu b/src/kernels/dropout.cu new file mode 100644 index 00000000..358f8871 --- /dev/null +++ b/src/kernels/dropout.cu @@ -0,0 +1,54 @@ +#include +#include + +#include "kernels/dropout.h" + + +#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \ + printf("Error at %s:%d\n",__FILE__,__LINE__);\ + exit(1);}} while(0) + +#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n",__FILE__,__LINE__);\ + exit(1);}} while(0) + +namespace marian { + +curandGenerator_t createCurandGenerator(size_t device, + size_t seed) { + cudaSetDevice(device); + curandGenerator_t generator; + 
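  // one generator per device; the fixed seed passed by the caller
  // (1234 in ExpressionGraph::setDevice) keeps dropout masks reproducible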
CURAND_CALL(curandCreateGenerator(&generator, + CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator, seed)); + + //cudaStream_t stream = 0; + //CURAND_CALL(curandSetStream(generator, stream)); + //CURAND_CALL(curandDestroyGenerator(generator)); + return generator; +} + + +__global__ +void gScale(float* data, int n, float p) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + + while (index < n) { + data[index] = (data[index] < p) / p; + index += gridDim.x * blockDim.x; + } +} + +void Dropout(Tensor tensor, float p, + curandGenerator_t gen) { + + int n = tensor->size(); + CURAND_CALL(curandGenerateUniform(gen, tensor->data(), n)); + + int numThreads = std::min(n, 512); + int numBlocks = n / numThreads + (n % numThreads != 0); + + gScale<<>>(tensor->data(), n, 1.f - p); +} + +} diff --git a/src/kernels/dropout.h b/src/kernels/dropout.h new file mode 100644 index 00000000..90fd996f --- /dev/null +++ b/src/kernels/dropout.h @@ -0,0 +1,15 @@ +#include +#include +#include +#include + +#include "tensors/tensor.h" + +namespace marian { + +curandGenerator_t createCurandGenerator(size_t device, size_t seed=1234); + +void Dropout(Tensor tensor, float h, + curandGenerator_t gen); + +} diff --git a/src/kernels/dropout_cudnn.cu b/src/kernels/dropout_cudnn.cu deleted file mode 100644 index f2190832..00000000 --- a/src/kernels/dropout_cudnn.cu +++ /dev/null @@ -1,70 +0,0 @@ -#include "dropout_cudnn.h" - -#include "tensors/tensor.h" - -namespace marian { - -static cudnnHandle_t create_handle_dnn() { - cudnnHandle_t cudnnHandle; - cudnnCreate(&cudnnHandle); - return cudnnHandle; -} - -cudnnHandle_t cudnnHandle = create_handle_dnn(); - -void CudnnDropoutPrepare(Tensor in, float p, - cudnnDropoutDescriptor_t* dropDesc, - void** space, size_t* spaceSize, - void** states, size_t seed) { - size_t statesSize; - cudnnDropoutGetStatesSize(cudnnHandle, &statesSize); - cudnnDropoutGetReserveSpaceSize(in->cudnn(), spaceSize); - - cudaMalloc((void**)states, statesSize); - cudaMalloc((void**)space, *spaceSize); - - cudnnCreateDropoutDescriptor(dropDesc); - cudnnSetDropoutDescriptor(*dropDesc, - cudnnHandle, - p, - (void*)*states, - statesSize, - seed); -} - -void CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc, - void* space, void* states) { - cudnnDestroyDropoutDescriptor(dropDesc); - cudaFree(space); - cudaFree(states); -} - -void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc, - void* space, size_t spaceSize, - Tensor out, Tensor in) { - cudnnDropoutForward(cudnnHandle, - dropoutDesc, - in->cudnn(), - in->data(), - out->cudnn(), - out->data(), - space, - spaceSize); -} - -/* void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc, */ - /* void* space, size_t spaceSize, */ - /* Tensor out, Tensor in) { */ - /* auto inGpu = static_cast(in.get()); */ - /* auto outGpu = static_cast(out.get()); */ - /* cudnnDropoutBackward(cudnnHandle, */ - /* dropoutDesc, */ - /* inGpu->cudnn(), */ - /* inGpu->data(), */ - /* outGpu->cudnn(), */ - /* outGpu->data(), */ - /* space, */ - /* spaceSize); */ -/* } */ - -} diff --git a/src/kernels/dropout_cudnn.h b/src/kernels/dropout_cudnn.h deleted file mode 100644 index 6f51c68b..00000000 --- a/src/kernels/dropout_cudnn.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once -#include - -#include "tensors/tensor.h" - -namespace marian { - -void CudnnDropoutPrepare(Tensor in, float p, - cudnnDropoutDescriptor_t* dropDesc, - void** space, size_t* spaceSize, - void** states, size_t seed); - -void 
CudnnDropoutDestroy(cudnnDropoutDescriptor_t dropDesc, - void* space, void* states); - -void CudnnDropoutForward(cudnnDropoutDescriptor_t dropoutDesc, - void* space, size_t spaceSize, - Tensor out, Tensor in); - -void CudnnDropoutBackward(cudnnDropoutDescriptor_t dropoutDesc, - void* space, size_t spaceSize, - Tensor out, Tensor in); - -} diff --git a/src/kernels/tensor_operators.cu b/src/kernels/tensor_operators.cu index da4e0efa..254c04d7 100644 --- a/src/kernels/tensor_operators.cu +++ b/src/kernels/tensor_operators.cu @@ -25,7 +25,7 @@ #include "3rd_party/reduce_all.h" namespace marian { - + cublasHandle_t create_handle(size_t device) { cudaSetDevice(device); @@ -116,7 +116,7 @@ __global__ void gSoftmax(float* out, const Shape outShape, const float* in, const float* mask) { - int rows = outShape[0]; + int rows = outShape[0] * outShape[2] * outShape[3]; int cols = outShape[1]; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; @@ -129,7 +129,7 @@ __global__ void gSoftmax(float* out, float* _max = _share + blockDim.x; _max[threadIdx.x] = sp[threadIdx.x]; // mask - for(int tid = 1; tid < cols; tid += blockDim.x) { + for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if (id < cols) { if (sp[id] > _max[threadIdx.x]) @@ -210,7 +210,7 @@ void Softmax(Tensor out, Tensor in, Tensor mask) { __global__ void gLogSoftmax(float* out, const Shape outShape, const float* in) { - int rows = outShape[0]; + int rows = outShape[0] * outShape[2] * outShape[3]; int cols = outShape[1]; for(int bid = 0; bid < rows; bid += gridDim.x) { int j = bid + blockIdx.x; @@ -222,7 +222,7 @@ __global__ void gLogSoftmax(float* out, float* _max = _share + blockDim.x; _max[threadIdx.x] = sp[threadIdx.x]; - for(int tid = 1; tid < cols; tid += blockDim.x) { + for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if (id < cols) { if (sp[id] > _max[threadIdx.x]) _max[threadIdx.x] = sp[id]; @@ -277,8 +277,8 @@ __global__ void gLogSoftmax(float* out, void LogSoftmax(Tensor out, Tensor in) { cudaSetDevice(out->getDevice()); - - size_t m = out->shape()[0]; + + size_t m = out->shape()[0] * out->shape()[2] * out->shape()[3]; size_t k = out->shape()[1]; int blocks = std::min(MAX_BLOCKS, (int) m); @@ -392,12 +392,12 @@ __global__ void gLogSoftmaxGrad(float* grad, const float* adj, const float* val, void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { cudaSetDevice(adj->getDevice()); - + // grad and val are both m-by-k matrices, passed as input. // A weighted average of each row of grad (according to the weights // specified in val) is computed and subtracted from Out. 
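  // (rows m now fold in dimensions 2 and 3, so batched 3- and 4-d tensors
  // go through the same row-wise kernel)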
// adj is multiplied for each element to get backward step in autodiff - int m = grad->shape()[0]; + int m = grad->shape()[0] * grad->shape()[2] * grad->shape()[3]; int k = grad->shape()[1]; int blocks = std::min(MAX_BLOCKS, m); @@ -548,7 +548,7 @@ __global__ void gCopyRows(float* out, const float* in, size_t cols, void CopyRows(Tensor out, const Tensor in, const std::vector& indeces) { cudaSetDevice(out->getDevice()); - + size_t cols = in->shape()[1]; size_t rowsToCopy = indeces.size(); @@ -589,7 +589,7 @@ __global__ void gPasteRows(float* out, const float* in, size_t cols, void PasteRows(Tensor out, const Tensor in, const std::vector& indeces) { cudaSetDevice(out->getDevice()); - + size_t cols = in->shape()[1]; size_t rowsToCopy = indeces.size(); @@ -610,19 +610,23 @@ void PasteRows(Tensor out, const Tensor in, const std::vector& indeces) void Transpose(cublasHandle_t cublasHandle, Tensor out, const Tensor in) { cudaSetDevice(out->getDevice()); - - size_t m = in->shape()[0]; - size_t n = in->shape()[1]; - float alpha = 1.0; - float beta = 0.0; + size_t steps = in->shape()[2] * in->shape()[3]; + for(int i = 0; i < steps; i++) { + size_t m = in->shape()[0]; + size_t n = in->shape()[1]; + float alpha = 1.0; + float beta = 0.0; - cublasSgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, in->data(), n, - &beta, in->data(), n, out->data(), m); + size_t offset = i * steps; + + cublasSgeam(cublasHandle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, in->data() + offset, n, + &beta, in->data() + offset, n, out->data() + offset, m); + } } void Concatenate0(Tensor out, const std::vector& inputs) { cudaSetDevice(out->getDevice()); - + size_t offset = 0; for(auto in : inputs) { UTIL_THROW_IF2(out->shape()[1] != in->shape()[1], @@ -658,9 +662,9 @@ __global__ void gInsertCols(float* out, const float* in, // dimensions, verify this! 
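A note on the recurring shape[0] * shape[2] * shape[3] changes in this file: marian tensors are 4-D with a {batch, dim, time, beam} layout, and the row-wise kernels here (softmax, log-softmax and their gradients, the GRU steps, concatenation) treat axis 1 as the column/feature axis. Flattening batch, time and beam into the row count lets the same 2-D kernels run once per (batch, time, beam) slice instead of assuming plain matrices. Roughly (a sketch of the convention, not code from the patch):

    // Assumes marian's {batch, dim, time, beam} axis order.
    int rows = shape[0] * shape[2] * shape[3]; // batch * time * beam row reductions
    int cols = shape[1];                       // reduction width = feature dimension
    // e.g. a {64, 512, 40, 5} tensor yields 64 * 40 * 5 = 12800 rows of width 512.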
void Concatenate1(Tensor out, const std::vector& inputs) { cudaSetDevice(out->getDevice()); - + size_t offset = 0; - int rows = out->shape()[0]; + int rows = out->shape()[0] * out->shape()[2] * out->shape()[3]; int cols_out = out->shape()[1]; for(auto in : inputs) { @@ -690,7 +694,7 @@ void Concatenate(Tensor out, const std::vector& inputs, int ax) { void Deconcatenate0(std::vector& outputs, const Tensor in) { cudaSetDevice(in->getDevice()); - + size_t offset = 0; for(auto out : outputs) { cudaMemcpy(out->data(), @@ -703,9 +707,9 @@ void Deconcatenate0(std::vector& outputs, const Tensor in) { void Deconcatenate1(std::vector& outputs, const Tensor in) { cudaSetDevice(in->getDevice()); - + size_t offset = 0; - int rows = in->shape()[0]; + int rows = in->shape()[0] * in->shape()[2] * in->shape()[3]; int cols_in = in->shape()[1]; for(auto out : outputs) { UTIL_THROW_IF2(out->shape()[0] != in->shape()[0], @@ -778,8 +782,8 @@ __global__ void gGRUFastForward(float* out, void GRUFastForward(Tensor out, std::vector inputs, bool final){ cudaSetDevice(out->getDevice()); - - int rows = out->shape()[0]; + + int rows = out->shape()[0] * out->shape()[2] * out->shape()[3]; int cols = out->shape()[1]; int blocks = std::min(MAX_BLOCKS, rows); @@ -881,10 +885,10 @@ __global__ void gGRUFastBackward(float* outState, void GRUFastBackward(std::vector outputs, std::vector inputs, Tensor adj, bool final) { - + cudaSetDevice(adj->getDevice()); - - int rows = adj->shape()[0]; + + int rows = adj->shape()[0] * adj->shape()[2] * adj->shape()[3]; int cols = adj->shape()[1]; int blocks = std::min(MAX_BLOCKS, rows); @@ -975,7 +979,7 @@ __global__ void gCrossEntropyPick(float* out, void CrossEntropyPick(Tensor out, Tensor in, Tensor pick) { cudaSetDevice(out->getDevice()); - + size_t m = in->shape()[0]; size_t k = in->shape()[1]; @@ -1065,7 +1069,7 @@ __global__ void gCrossEntropyPickBackward(float* out, void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { cudaSetDevice(out->getDevice()); - + size_t m = out->shape()[0]; size_t k = out->shape()[1]; @@ -1082,7 +1086,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { float L2Norm(Tensor in) { cudaSetDevice(in->getDevice()); - + float* data; cudaMalloc(&data, sizeof(float)); Tensor out(new TensorBase(data, {1, 1}, in->getDevice())); @@ -1094,21 +1098,24 @@ float L2Norm(Tensor in) { } __global__ void gAtt(float* out, - const float* in1, - const float* in2, - const float* in3, - int m, // rows - int k, // cols - int n // rows of in2 - ) { + const float* va, + const float* ctx, + const float* state, + const float* cov, + int m, // total rows (batch x time x beam) + int k, // depth + int b, // batch size + int t // time of ctx + ) { int rows = m; int cols = k; for(int bid = 0; bid < m; bid += gridDim.x) { int j = bid + blockIdx.x; if(j < rows) { - const float* in1Row = in1 + j * cols; - const float* in2Row = in2 + (j % n) * cols; - const float* in3Row = in3; + const float* vaRow = va; + const float* ctxRow = ctx + (j % (b * t)) * cols; + const float* stateRow = state + (j / (b * t) + j % b) * cols; + const float* covRow = cov ? 
cov + (j % (b * t)) * cols : nullptr;
 
       extern __shared__ float _share[];
       float* _sum = _share + blockDim.x;
@@ -1117,7 +1124,10 @@ __global__ void gAtt(float* out,
       for(int tid = 0; tid < cols; tid += blockDim.x) {
         int id = tid + threadIdx.x;
         if(id < cols) {
-          float ex = tanhf(in1Row[id] + in2Row[id]) * in3Row[id];
+          float z = ctxRow[id] + stateRow[id];
+          if(cov)
+            z += covRow[id];
+          float ex = tanhf(z) * vaRow[id];
           _sum[threadIdx.x] += ex;
         }
       }
@@ -1136,33 +1146,39 @@ __global__ void gAtt(float* out,
   }
 }
 
-void Att(Tensor out, Tensor context, Tensor state, Tensor va) {
+void Att(Tensor out,
+         Tensor va,
+         Tensor context,
+         Tensor state,
+         Tensor coverage) {
   cudaSetDevice(out->getDevice());
-
-  size_t m = context->shape()[0] * context->shape()[2] * context->shape()[3];
-  size_t k = context->shape()[1];
-  size_t n = context->shape()[0];
+  size_t m = out->shape()[0] * out->shape()[2] * out->shape()[3];
+
+  size_t b = context->shape()[0];
+  size_t k = context->shape()[1];
+  size_t t = context->shape()[2];
 
   int blocks = std::min(MAX_BLOCKS, (int) m);
   int threads = std::min(MAX_THREADS, (int) k);
   int shared = sizeof(float) * threads * 2;
-
-
   gAtt<<<blocks, threads, shared>>>(out->data(),
+                                    va->data(),
                                     context->data(),
                                     state->data(),
-                                    va->data(),
-                                    m, k, n);
+                                    coverage ? coverage->data() : nullptr,
+                                    m, k, b, t);
 }
 
-__global__ void gAttBack(float* gContext,
+__global__ void gAttBack(float* gVa,
+                         float* gContext,
                          float* gState,
-                         float* gVa,
+                         float* gCoverage,
+                         const float* va,
                          const float* context,
                          const float* state,
-                         const float* va,
+                         const float* coverage,
                          const float* adj,
                          int m, // rows
                          int k, // cols
@@ -1175,18 +1191,26 @@ __global__ void gAttBack(float* gContext,
     if(j < rows) {
       float* gcRow = gContext + j * cols;
       float* gsRow = gState + (j % n) * cols;
+      float* gcovRow = gCoverage ? gCoverage + j * cols : nullptr;
 
       const float* cRow = context + j * cols;
       const float* sRow = state + (j % n) * cols;
+      const float* covRow = coverage ? coverage + j * cols : nullptr;
 
       for(int tid = 0; tid < cols; tid += blockDim.x) {
         int id = tid + threadIdx.x;
         if(id < cols) {
-          float t = tanhf(cRow[id] + sRow[id]);
+          float z = cRow[id] + sRow[id];
+          if(coverage)
+            z += covRow[id];
+
+          float t = tanhf(z);
           float r = va[id] * (1.f - t * t);
 
           gcRow[id] += r * adj[j];
           gsRow[id] += r * adj[j];
+          if(gCoverage)
+            gcovRow[id] += r * adj[j];
 
           atomicAdd(gVa + id, t * adj[j]);
         }
       }
@@ -1195,11 +1219,11 @@ __global__ void gAttBack(float* gContext,
   }
 }
 
-void AttBack(Tensor gContext, Tensor gState, Tensor gVa,
-             Tensor context, Tensor state, Tensor va,
+void AttBack(Tensor gVa, Tensor gContext, Tensor gState, Tensor gCoverage,
+             Tensor va, Tensor context, Tensor state, Tensor coverage,
              Tensor adj) {
   cudaSetDevice(adj->getDevice());
-
+
   size_t m = context->shape()[0] * context->shape()[2] * context->shape()[3];
   size_t k = context->shape()[1];
@@ -1208,16 +1232,208 @@ void AttBack(Tensor gContext, Tensor gState, Tensor gVa,
   int blocks = std::min(MAX_BLOCKS, (int) n);
   int threads = std::min(MAX_THREADS, (int) k);
 
-  gAttBack<<<blocks, threads>>>(gContext->data(),
+  gAttBack<<<blocks, threads>>>(gVa->data(),
+                                gContext->data(),
                                 gState->data(),
-                                gVa->data(),
+                                gCoverage ? gCoverage->data() : nullptr,
+                                va->data(),
                                 context->data(),
                                 state->data(),
-                                va->data(),
+                                coverage ?
coverage->data() : nullptr, adj->data(), m, k, n); } +__global__ void gLNormalization(float* out, const float* in, const float* alpha, const float* beta, + int rows, int cols, float eps=1e-9) { + extern __shared__ float _share[]; + + for (int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if (j < rows) { + float* so = out + j * cols; + const float* sp = in + j * cols; + + float* _sum = _share + blockDim.x; + _sum[threadIdx.x] = 0.0f; + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + _sum[threadIdx.x] += sp[id]; + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + } + len = (len + 1) >> 1; + } + __syncthreads(); + float mean = _sum[0] / cols; + __syncthreads(); + + float* _sqSum = _share + blockDim.x; + + _sqSum[threadIdx.x] = 0.0; + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + float ex = sp[id] - mean; + _sqSum[threadIdx.x] += ex * ex; + } + } + __syncthreads(); + len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) + _sqSum[threadIdx.x] += _sqSum[threadIdx.x + skip]; + len = (len + 1) >> 1; + } + __syncthreads(); + float sigma = sqrtf(eps + (_sqSum[0] / cols)); + __syncthreads(); + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + float t = alpha[id] * ((sp[id] - mean) / sigma); + if (beta != nullptr) + t += beta[id]; + so[id] = t; + } + } + } + } } + +void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps) { + cudaSetDevice(out->getDevice()); + + int rows = in->shape()[0] * in->shape()[2] * in->shape()[3]; + int cols = in->shape()[1]; + + int blocks = std::min(MAX_BLOCKS, (int)rows); + int threads = std::min(MAX_THREADS, (int)cols); + int shared = 2 * threads * sizeof(float); + + gLNormalization<<>>(out->data(), + in->data(), + gamma->data(), + beta ? beta->data() : nullptr, + rows, cols, eps); +} + +__global__ void gLayerNormalizationGrad(float* gradX, float* gradGamma, float* gradBeta, + float* adj, float* y, float* x, float* gamma, float* beta, + int rows, int cols, float eps=1e-9) { + extern __shared__ float shared[]; + + for (int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if (j < rows) { + float* sum_adj = shared; + float* sum_adj_x = shared + blockDim.x; + float* sum_x = shared + 2 * blockDim.x; + float* sum_sqr = shared + 3 * blockDim.x; + + const float* xRow = x + j * cols; + const float* yRow = y + j * cols; + const float* adjRow = adj + j * cols; + float* gradXRow = gradX + j * cols; + + sum_x[threadIdx.x] = 0.0f; + sum_adj[threadIdx.x] = 0.0f; + sum_adj_x[threadIdx.x] = 0.0f; + sum_sqr[threadIdx.x] = 0.0f; + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + sum_x[threadIdx.x] += xRow[id]; + sum_adj_x[threadIdx.x] += adjRow[id] * (yRow[id] - ((beta) ? 
beta[id] : 0)) / gamma[id]; + sum_adj[threadIdx.x] += adjRow[id]; + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + sum_x[threadIdx.x] += sum_x[threadIdx.x + skip]; + sum_adj[threadIdx.x] += sum_adj[threadIdx.x + skip]; + sum_adj_x[threadIdx.x] += sum_adj_x[threadIdx.x + skip]; + } + len = (len + 1) >> 1; + } + __syncthreads(); + float mean = sum_x[0] / cols; + __syncthreads(); + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + float ex = xRow[id] - mean; + sum_sqr[threadIdx.x] += ex * ex; + } + } + + __syncthreads(); + len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) + sum_sqr[threadIdx.x] += sum_sqr[threadIdx.x + skip]; + len = (len + 1) >> 1; + } + __syncthreads(); + float sigma = sqrtf(eps + (sum_sqr[0] / cols)); + __syncthreads(); + + + for (int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + float grad_x = 0.0f; + float x_hat = (yRow[id] - ((beta) ? beta[id] : 0) ) / gamma[id]; + grad_x += cols * adjRow[id]; + grad_x -= sum_adj[0]; + grad_x -= sum_adj_x[0] * x_hat; + grad_x /= (cols * sigma); + + gradXRow[id] += gamma[id] * grad_x; + atomicAdd(gradGamma + id, adjRow[id] * x_hat); + if (beta) { + atomicAdd(gradBeta + id, adjRow[id]); + } + } + } + } + } +} + +void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta, + Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta) { + cudaSetDevice(adj->getDevice()); + int rows = y->shape()[0] * y->shape()[2] * y->shape()[3]; + int cols = y->shape()[1]; + + int threads = std::min(MAX_THREADS, cols); + int blocks = std::min(MAX_BLOCKS, rows); + int shared = sizeof(float) * threads * 4; + + gLayerNormalizationGrad<<>> + (gradX->data(), gradGamma->data(), (gradBeta) ? gradBeta->data() : nullptr, + adj->data(), y->data(), x->data(), gamma->data(),(beta) ? 
beta->data() : nullptr, rows, cols); +} + +} // namespace marian diff --git a/src/kernels/tensor_operators.h b/src/kernels/tensor_operators.h index d1adfde9..99c0586d 100644 --- a/src/kernels/tensor_operators.h +++ b/src/kernels/tensor_operators.h @@ -45,8 +45,9 @@ __global__ void gAdd(Functor functor, Shape outShape, const float* in1, const Shape in1Shape, - const Shape full) { - + const Shape full, + float scale = 1.0) { + int outLength = outShape.elements(); bool same = outLength == full.elements() && outLength == in1Shape.elements(); @@ -61,7 +62,7 @@ __global__ void gAdd(Functor functor, int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < outLength) { if(same) { - out[index] += functor(in1[index]); + out[index] += functor(in1[index]) * scale; } else { outShape.dims(index, dims); @@ -83,15 +84,75 @@ __global__ void gAdd(Functor functor, } } if(sum) - out[index] += sum; + out[index] += sum * scale; } } } } +template +__global__ void gAdd1(Functor functor, + float* out, + Shape outShape, + const float* in1, + const Shape in1Shape, + const Shape full, + float scale = 1.0) { + + int rows = full[0] * full[2] * full[3]; + int cols = full[1]; + bool same = in1Shape.elements() == full.elements(); + + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + extern __shared__ float _share[]; + float* _sum = _share + blockDim.x; + + if(same) { + const float* sp = in1 + j * cols; + _sum[threadIdx.x] = 0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + _sum[threadIdx.x] += functor(sp[id]); + } + } + } + else { + int dims[4]; + _sum[threadIdx.x] = 0; + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + full.dims(j * cols + id, dims); + int in1Index = in1Shape.bindex(dims); + _sum[threadIdx.x] += functor(in1[in1Index]); + } + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + + } + len = (len + 1) >> 1; + } + __syncthreads(); + out[j] += _sum[0] * scale; + } + } +} + + template void Add(Functor functor, - Tensor out, Tensor in) { + Tensor out, Tensor in, float scale = 1.0) { cudaSetDevice(out->getDevice()); @@ -101,20 +162,36 @@ void Add(Functor functor, int length = out->shape().elements(); - int threads = std::min(MAX_THREADS, length); - int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + if(full.elements() / length == full[1]) { + size_t m = full.elements() / length; + size_t k = full[1]; - gAdd<<>>(functor, - out->data(), out->shape(), - in->data(), in->shape(), - full); + int blocks = std::min(MAX_BLOCKS, (int) m); + int threads = std::min(MAX_THREADS, (int) k); + int shared = sizeof(float) * threads * 2; + + gAdd1<<>>(functor, + out->data(), out->shape(), + in->data(), in->shape(), + full, scale); + } + else { + + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAdd<<>>(functor, + out->data(), out->shape(), + in->data(), in->shape(), + full, scale); + } } template void Reduce(Functor functor, - T1 out, T2 in) { + T1 out, T2 in, float scale = 1.0) { out->set(0); - Add(functor, out, in); + Add(functor, out, in, scale); } template @@ -125,7 +202,8 @@ __global__ void gAdd(Functor functor, const Shape in1Shape, const float* in2, const Shape in2Shape, - const Shape full) { + const 
Shape full, + float scale = 1.0) { int outLength = outShape.elements(); @@ -144,7 +222,7 @@ __global__ void gAdd(Functor functor, int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if (index < outLength) { if(same) { - out[index] += functor(in1[index], in2[index]); + out[index] += functor(in1[index], in2[index]) * scale; } else { outShape.dims(index, dims); @@ -166,15 +244,80 @@ __global__ void gAdd(Functor functor, } } if(sum) - out[index] += sum; + out[index] += sum * scale; } } } } +template +__global__ void gAdd1(Functor functor, + float* out, + Shape outShape, + const float* in1, + const Shape in1Shape, + const float* in2, + const Shape in2Shape, + const Shape full, + float scale = 1.0) { + + int rows = full[0] * full[2] * full[3]; + int cols = full[1]; + bool same = in1Shape.elements() == full.elements() + && in2Shape.elements() == full.elements(); + + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + extern __shared__ float _share[]; + float* _sum = _share + blockDim.x; + + if(same) { + const float* sp1 = in1 + j * cols; + const float* sp2 = in2 + j * cols; + _sum[threadIdx.x] = 0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + _sum[threadIdx.x] += functor(sp1[id], sp2[id]); + } + } + } + else { + int dims[4]; + _sum[threadIdx.x] = 0; + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if (id < cols) { + full.dims(j * cols + id, dims); + int in1Index = in1Shape.bindex(dims); + int in2Index = in2Shape.bindex(dims); + _sum[threadIdx.x] += functor(in1[in1Index], in2[in2Index]); + } + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if (threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + + } + len = (len + 1) >> 1; + } + __syncthreads(); + out[j] += _sum[0] * scale; + } + } +} + + template void Add(Functor functor, - Tensor out, Tensor in1, Tensor in2) { + Tensor out, Tensor in1, Tensor in2, float scale = 1.0) { cudaSetDevice(out->getDevice()); @@ -186,23 +329,39 @@ void Add(Functor functor, int length = out->shape().elements(); - int threads = std::min(MAX_THREADS, length); - int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + /* + if(full.elements() / length == full[1]) { + size_t m = full.elements() / length; + size_t k = full[1]; - gAdd<<>>(functor, - out->data(), out->shape(), - in1->data(), in1->shape(), - in2->data(), in2->shape(), - full); + int blocks = std::min(MAX_BLOCKS, (int) m); + int threads = std::min(MAX_THREADS, (int) k); + int shared = sizeof(float) * threads * 2; + gAdd1<<>>(functor, + out->data(), out->shape(), + in1->data(), in1->shape(), + in2->data(), in2->shape(), + full); + } + else {*/ + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAdd<<>>(functor, + out->data(), out->shape(), + in1->data(), in1->shape(), + in2->data(), in2->shape(), + full, scale); + //} } template void Reduce(Functor functor, - Tensor out, Tensor in1, Tensor in2) { + Tensor out, Tensor in1, Tensor in2, float scale = 1.0) { out->set(0); - Add(functor, out, in1, in2); + Add(functor, out, in1, in2, scale); } @@ -680,9 +839,13 @@ void GRUFastBackward(std::vector outputs, std::vector inputs, Tensor adj, bool final = false); -void Att(Tensor out, Tensor context, Tensor state, Tensor va); -void AttBack(Tensor gContext, Tensor gState, Tensor gva, - 
Tensor context, Tensor state, Tensor va,
+void Att(Tensor out, Tensor va, Tensor context, Tensor state, Tensor coverage);
+void AttBack(Tensor gva, Tensor gContext, Tensor gState, Tensor gCoverage,
+             Tensor va, Tensor context, Tensor state, Tensor coverage,
              Tensor adj);
+void LayerNormalization(Tensor out, Tensor in, Tensor gamma, Tensor beta, float eps=1e-9);
+void LayerNormalizationGrad(Tensor gradX, Tensor gradGamma, Tensor gradBeta,
+                            Tensor adj, Tensor y, Tensor x, Tensor gamma, Tensor beta);
+
 }
diff --git a/src/kernels/thrust_functions.h b/src/kernels/thrust_functions.h
index a48a7f71..e0529364 100644
--- a/src/kernels/thrust_functions.h
+++ b/src/kernels/thrust_functions.h
@@ -48,6 +48,7 @@ namespace thrust
       return compose(unary_operator(), _1);
     }
 
+
     template <typename T>
     struct unary_log : public thrust::unary_function<T, T> {
       __host__ __device__
@@ -166,6 +167,33 @@ namespace thrust
                      make_actor(_1),
                      make_actor(_2));
     }
+
+    template <typename T>
+    struct binary_pow : public thrust::binary_function<T, T, T> {
+      __host__ __device__
+      T operator()(const T &x, const T &y) const {
+        float tx = x;
+        if(y == (int)y && (int)y % 2 == 0)
+          tx = abs(x);
+        return powf(tx, y);
+      }
+    };
+
+    template <typename Eval, typename T2>
+    __host__ __device__
+    actor<
+      composite<
+        binary_operator<binary_pow>,
+        actor<Eval>,
+        typename as_actor<T2>::type
+      >
+    >
+    Pow(const actor<Eval> &_1, const T2 &_2)
+    {
+      return compose(binary_operator<binary_pow>(),
+                     make_actor(_1),
+                     make_actor(_2));
+    }
   }
 }
}
diff --git a/src/layers/attention.h b/src/layers/attention.h
index 6e52c5fd..1f55a5ac 100644
--- a/src/layers/attention.h
+++ b/src/layers/attention.h
@@ -2,6 +2,7 @@
 
 #include "marian.h"
 #include "graph/expression_graph.h"
+#include "layers/rnn.h"
 
 namespace marian {
 
@@ -13,17 +14,19 @@ struct AttentionNodeOp : public NaryNodeOp {
                 keywords::shape=newShape(nodes)) {}
 
   Shape newShape(const std::vector<Expr>& nodes) {
-    Shape shape = nodes[0]->shape();
-    Shape shape2 = nodes[1]->shape();
-    Shape shape3 = nodes[2]->shape();
+    Shape shape = nodes[1]->shape();
 
-    for(int i = 0; i < shape2.size(); ++i) {
-      UTIL_THROW_IF2(shape[i] != shape2[i] && shape[i] != 1 && shape2[i] != 1,
+    Shape vaShape = nodes[0]->shape();
+    Shape ctxShape = nodes[1]->shape();
+    Shape stateShape = nodes[2]->shape();
+
+    for(int i = 0; i < stateShape.size(); ++i) {
+      UTIL_THROW_IF2(ctxShape[i] != stateShape[i] && ctxShape[i] != 1 && stateShape[i] != 1,
                      "Shapes cannot be broadcasted");
-      shape.set(i, std::max(shape[i], shape2[i]));
+      shape.set(i, std::max(ctxShape[i], stateShape[i]));
     }
 
-    UTIL_THROW_IF2(shape3[0] != shape[1] || shape3[1] != 1,
+    UTIL_THROW_IF2(vaShape[0] != shape[1] || vaShape[1] != 1,
                    "Wrong size");
 
     shape.set(1, 1);
@@ -35,7 +38,8 @@ struct AttentionNodeOp : public NaryNodeOp {
       NodeOp(Att(val_,
                  children_[0]->val(),
                  children_[1]->val(),
-                 children_[2]->val()))
+                 children_[2]->val(),
+                 children_.size() == 4 ? children_[3]->val() : nullptr))
     };
   }
 
@@ -46,9 +50,11 @@ struct AttentionNodeOp : public NaryNodeOp {
                  children_[0]->grad(),
                  children_[1]->grad(),
                  children_[2]->grad(),
+                 children_.size() == 4 ? children_[3]->grad() : nullptr,
                  children_[0]->val(),
                  children_[1]->val(),
                  children_[2]->val(),
+                 children_.size() == 4 ?
children_[3]->val() : nullptr, adj_ ); ) @@ -70,22 +76,33 @@ struct AttentionNodeOp : public NaryNodeOp { } }; -Expr attOps(Expr context, Expr state, Expr va) { - std::vector nodes{context, state, va}; +Expr attOps(Expr va, Expr context, Expr state, Expr coverage=nullptr) { + std::vector nodes{va, context, state}; + if(coverage) + nodes.push_back(coverage); + int dimBatch = context->shape()[0]; int dimWords = context->shape()[2]; + int dimBeam = state->shape()[3]; return reshape(Expression(nodes), - {dimWords, dimBatch}); + {dimWords, dimBatch, 1, dimBeam}); } class GlobalAttention { private: Expr Wa_, ba_, Ua_, va_; + Expr gammaContext_, betaContext_; + Expr gammaState_, betaState_; + Expr context_; Expr softmaxMask_; Expr mappedContext_; std::vector contexts_; + std::vector alignments_; + bool layerNorm_; + + Expr cov_; public: @@ -95,21 +112,34 @@ class GlobalAttention { int dimDecState, Args ...args) : context_(context), - softmaxMask_(nullptr) { + softmaxMask_(nullptr), + layerNorm_(Get(keywords::normalize, false, args...)), + cov_(Get(keywords::coverage, nullptr, args...)) { int dimEncState = context->shape()[1]; + auto graph = context->graph(); Wa_ = graph->param(prefix + "_W_comb_att", {dimDecState, dimEncState}, keywords::init=inits::glorot_uniform); - ba_ = graph->param(prefix + "_b_att", {1, dimEncState}, - keywords::init=inits::zeros); Ua_ = graph->param(prefix + "_Wc_att", {dimEncState, dimEncState}, keywords::init=inits::glorot_uniform); va_ = graph->param(prefix + "_U_att", {dimEncState, 1}, keywords::init=inits::glorot_uniform); + ba_ = graph->param(prefix + "_b_att", {1, dimEncState}, + keywords::init=inits::zeros); - mappedContext_ = affine(context_, Ua_, ba_); + if(layerNorm_) { + gammaContext_ = graph->param(prefix + "_att_gamma1", {1, dimEncState}, + keywords::init=inits::from_value(1.0)); + gammaState_ = graph->param(prefix + "_att_gamma2", {1, dimEncState}, + keywords::init=inits::from_value(1.0)); + + mappedContext_ = layer_norm(dot(context_, Ua_), gammaContext_, ba_); + } + else { + mappedContext_ = affine(context_, Ua_, ba_); + } auto softmaxMask = Get(keywords::mask, nullptr, args...); if(softmaxMask) { @@ -124,21 +154,23 @@ class GlobalAttention { int dimBatch = context_->shape()[0]; int srcWords = context_->shape()[2]; + int dimBeam = state->shape()[3]; auto mappedState = dot(state, Wa_); - auto attReduce = attOps(mappedContext_, mappedState, va_); + if(layerNorm_) + mappedState = layer_norm(mappedState, gammaState_); + + auto attReduce = attOps(va_, mappedContext_, mappedState); // @TODO: horrible -> - auto e = reshape( - transpose(softmax(transpose(attReduce), - softmaxMask_)), - {dimBatch, 1, srcWords}); + auto e = reshape(transpose(softmax(transpose(attReduce), softmaxMask_)), + {dimBatch, 1, srcWords, dimBeam}); // <- horrible - auto alignedSource = weighted_average(context_, e, - axis=2); + auto alignedSource = weighted_average(context_, e, axis=2); contexts_.push_back(alignedSource); + alignments_.push_back(e); return alignedSource; } diff --git a/src/layers/dropout.h b/src/layers/dropout.h deleted file mode 100644 index aedfa287..00000000 --- a/src/layers/dropout.h +++ /dev/null @@ -1,60 +0,0 @@ -#include -#include -#include -#include - -#include "tensors/tensor.h" - - -#define CUDA_CALL(x) do { if((x)!=cudaSuccess) { \ - printf("Error at %s:%d\n",__FILE__,__LINE__);\ - exit(1);}} while(0) - -#define CURAND_CALL(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \ - printf("Error at %s:%d\n",__FILE__,__LINE__);\ - exit(1);}} while(0) - - -__global__ -void 
gScalled(float* data, int n, float p) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - - while (index < n) { - data[index] = (data[index] < p) / p; - index += gridDim.x * blockDim.x; - } -} - -namespace marian { - -class DropoutGenerator { - public: - DropoutGenerator(cudaStream_t stream=0, unsigned long long seed = 1234ULL) { - CURAND_CALL(curandCreateGenerator(&generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CALL(curandSetPseudoRandomGeneratorSeed(generator_, seed)); - CURAND_CALL(curandSetStream(generator_, stream)); - } - - void Generate(Tensor& tensor, float p) { - Generate(tensor->data(), tensor->size(), p); - } - - - void Generate(float* data, int n, float p) { - CURAND_CALL(curandGenerateUniform(generator_, data, n)); - int numThreads = std::min(n, 512); - int numBlocks = n / numThreads + (n % numThreads != 0); - - gScalled<<>>(data, n, p); - } - - ~DropoutGenerator() { - CURAND_CALL(curandDestroyGenerator(generator_)); - } - - private: - curandGenerator_t generator_; - -}; - -} diff --git a/src/layers/generic.h b/src/layers/generic.h index 65eea110..1d0256d4 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -28,6 +28,7 @@ namespace marian { private: int outDim_; act activation_; + bool layerNorm_; public: template @@ -38,18 +39,30 @@ namespace marian { outDim_(outDim), activation_(Get(keywords::activation, act::linear, - args...)) {} + args...)), + layerNorm_(Get(keywords::normalize, + false, args...)) {} Expr operator()(Expr in) { auto g = in->graph(); auto W = g->param(name_ + "_W", {in->shape()[1], outDim_}, keywords::init=inits::glorot_uniform); auto b = g->param(name_ + "_b", {1, outDim_}, - keywords::init=inits::zeros); + keywords::init=inits::zeros); params_ = { W, b }; - auto out = affine(in, W, b); + Expr out; + if(layerNorm_) { + auto gamma = g->param(name_ + "_gamma", {1, outDim_}, + keywords::init=inits::from_value(1.0)); + + params_.push_back(gamma); + out = layer_norm(dot(in, W), gamma, b); + } + else { + out = affine(in, W, b); + } switch (activation_) { case act::linear : @@ -81,13 +94,21 @@ namespace marian { {in->shape()[1], outDim_}, keywords::init=inits::glorot_uniform); auto b = g->param(name_ + "_b" + std::to_string(i), - {1, outDim_}, - keywords::init=inits::zeros); - + {1, outDim_}, + keywords::init=inits::zeros); params_.push_back(W); params_.push_back(b); - outputs.push_back(affine(in, W, b)); + if(layerNorm_) { + auto gamma = g->param(name_ + "_gamma" + std::to_string(i), {1, outDim_}, + keywords::init=inits::from_value(1.0)); + + params_.push_back(gamma); + outputs.push_back(layer_norm(dot(in, W), gamma, b)); + } + else { + outputs.push_back(affine(in, W, b)); + } i++; } @@ -145,7 +166,7 @@ namespace marian { auto mask = Get(keywords::mask, nullptr, args...); auto ce = cross_entropy(in, picks); - + if(mask) ce = ce * mask; diff --git a/src/layers/param_initializers.cpp b/src/layers/param_initializers.cpp index 64f4936b..08ca9bc9 100644 --- a/src/layers/param_initializers.cpp +++ b/src/layers/param_initializers.cpp @@ -1,23 +1,3 @@ -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. 
-// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. #include #include @@ -44,9 +24,6 @@ float xor128() { return 0.1 * ((w % 1000) / 1000.f) - 0.05; } -// Use a constant seed for deterministic behaviour. -//std::default_random_engine engine(42); - void zeros(Tensor t) { t->set(0.f); } diff --git a/src/layers/param_initializers.h b/src/layers/param_initializers.h index 51b45944..f1fab867 100644 --- a/src/layers/param_initializers.h +++ b/src/layers/param_initializers.h @@ -1,26 +1,5 @@ #pragma once -// This file is part of the Marian toolkit. -// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. 
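One behavioural consequence of the param_initializers.h hunk just below: the engine is still constructed from std::random_device, but the added engine.seed(1234) immediately overrides that nondeterministic seed, so parameter initialization becomes repeatable across runs. The effect in isolation (a sketch, not code from the patch):

    #include <random>

    std::random_device device;
    std::default_random_engine engine(device()); // nondeterministic seed...
    engine.seed(1234);                           // ...discarded: draws are now identical every run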
- #include #include #include "tensors/tensor.h" @@ -47,6 +26,7 @@ template void distribution(std::vector& vals, float a, float b) { std::random_device device; std::default_random_engine engine(device()); + engine.seed(1234); Distribution dist(a, b); auto gen = std::bind(dist, engine); diff --git a/src/layers/rnn.h b/src/layers/rnn.h index cff95028..0635748e 100644 --- a/src/layers/rnn.h +++ b/src/layers/rnn.h @@ -1,3 +1,5 @@ +#pragma once + #include #include #include @@ -10,7 +12,6 @@ #include "graph/expression_graph.h" #include "layers/generic.h" -#include "layers/attention.h" namespace marian { @@ -56,6 +57,7 @@ class Tanh { template class RNN : public Layer { public: + int dimInput_; int dimState_; dir direction_; bool outputLast_; @@ -63,15 +65,17 @@ class RNN : public Layer { Ptr cell_; template - RNN(const std::string& name, - int dimState, - Cell cell, - Args ...args) + RNN(Ptr graph, + const std::string& name, + int dimInput, + int dimState, + Args ...args) : Layer(name), + dimInput_{dimInput}, dimState_{dimState}, direction_{Get(keywords::direction, dir::forward, args...)}, outputLast_{Get(keywords::output_last, false, args...)}, - cell_(New(cell)) {} + cell_(New(graph, name_, dimInput_, dimState_, args...)) {} Ptr getCell() { return cell_; @@ -80,7 +84,6 @@ class RNN : public Layer { std::vector apply(const Expr input, const Expr initialState, const Expr mask = nullptr, bool reverse = false) { auto xW = cell_->apply1(input); - std::vector outputs; auto state = initialState; for(size_t i = 0; i < input->shape()[2]; ++i) { @@ -114,13 +117,11 @@ class RNN : public Layer { auto graph = input->graph(); int dimInput = input->shape()[1]; - cell_->initialize(graph, name_, dimInput, dimState_, args...); - Expr mask = Get(keywords::mask, nullptr, args...); if(direction_ == dir::backward) { auto states = apply(input, state, mask, true); - //std::reverse(states.begin(), states.end()); + std::reverse(states.begin(), states.end()); if(outputLast_) return states.back(); else @@ -139,63 +140,130 @@ class RNN : public Layer { } }; +template +class MLRNN : public Layer { + private: + int layers_; + bool skip_; + bool skipFirst_; + int dimState_; + std::vector>> rnns_; + + public: + + template + MLRNN(Ptr graph, + const std::string& name, + int layers, + int dimInput, + int dimState, + Args ...args) + : Layer(name), + layers_(layers), + skip_(Get(keywords::skip, false, args...)), + skipFirst_(Get(keywords::skip_first, false, args...)), + dimState_{dimState} { + for(int i = 0; i < layers; ++i) { + rnns_.push_back( + New>(graph, + name + "_l" + std::to_string(i), + i == 0 ? dimInput : dimState, + dimState, + args...) 
+ ); + } + } + + template + std::tuple> + operator()(Expr input, Args ...args) { + Expr output; + std::vector outStates; + for(int i = 0; i < layers_; ++i) { + auto outState = (*rnns_[i])(input, args...); + outStates.push_back(outState); + + if(skip_ && (skipFirst_ || i > 0)) + output = outState + input; + else + output = outState; + + input = output; + } + return std::make_tuple(output, outStates); + } + + template + std::tuple> + operator()(Expr input, + std::vector states, + Args ...args) { + Expr output; + std::vector outStates; + for(int i = 0; i < layers_; ++i) { + auto outState = (*rnns_[i])(input, states[i], args...); + outStates.push_back(outState); + + if(skip_ && (skipFirst_ || i > 0)) + output = outState + input; + else + output = outState; + + input = output; + } + return std::make_tuple(output, outStates); + } +}; + template class BiRNN : public Layer { public: + int layers_; int dimState_; Ptr> rnn1_; Ptr> rnn2_; template - BiRNN(const std::string& name, - int dimState, - Cell cell1, - Cell cell2, - Args ...args) + BiRNN(Ptr graph, + const std::string& name, + int layers, + int dimInput, + int dimState, + Args ...args) : Layer(name), dimState_{dimState}, - rnn1_(New>(name, dimState, cell1, - keywords::direction=dir::forward, - args...)), - rnn2_(New>(name + "_r", dimState, cell2, - keywords::direction=dir::backward, - args...)) {} + rnn1_(New>(graph, name, layers, dimInput, dimState, + keywords::direction=dir::forward, + args...)), + rnn2_(New>(graph, name + "_r", layers, dimInput, dimState, + keywords::direction=dir::backward, + args...)) {} template - BiRNN(const std::string& name, - int dimState, - Args ...args) - : BiRNN(name, dimState, Cell(), Cell(), args...) {} + std::vector operator()(Expr input, Args ...args) { + Expr mask = Get(keywords::mask, nullptr, args...); + auto statesfw = (*rnn1_)(input); + auto statesbw = (*rnn2_)(input, keywords::mask=mask); - template - Expr operator()(Expr input, Args ...args) { - auto graph = input->graph(); - int dimBatch = input->shape()[0]; - auto startState = graph->zeros(keywords::shape={dimBatch, dimState_}); - return (*this)(input, startState, args...); + std::vector outStates; + for(int i = 0; i < layers_; ++i) + outStates.push_back(concatenate({statesfw[i], statesbw[i]}, + keywords::axis=1)); + return outStates; } template - Expr operator()(Expr input, Expr state, Args ...args) { + std::vector operator()(Expr input, std::vector states, Args ...args) { Expr mask = Get(keywords::mask, nullptr, args...); + auto statesfw = (*rnn1_)(input, states); + auto statesbw = (*rnn2_)(input, states, keywords::mask=mask); - auto graph = input->graph(); - int dimInput = input->shape()[1]; - - rnn1_->getCell()->initialize(graph, name_, dimInput, dimState_, args...); - auto states1 = rnn1_->apply(input, state, nullptr); - - rnn2_->getCell()->initialize(graph, name_ + "_r", dimInput, dimState_, args...); - auto states2 = rnn2_->apply(input, state, mask, true); - - std::reverse(states2.begin(), states2.end()); - std::vector states; - for(int i = 0; i < states1.size(); ++i) - states.push_back(concatenate({states1[i], states2[i]}, - keywords::axis=1)); - - return concatenate(states, keywords::axis=2); + std::vector outStates; + for(int i = 0; i < layers_; ++i) + outStates.push_back(concatenate({statesfw[i], statesbw[i]}, + keywords::axis=1)); + return outStates; } }; @@ -255,21 +323,32 @@ Expr gruOps(const std::vector& nodes, bool final = false) { return Expression(nodes, final); } +/***************************************************************/ + 
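The new MLRNN wrapper above stacks layers_ single-layer RNNs and optionally wires residual (skip) connections between them: with skip_ set, each layer past the first (or every layer, with skip_first) outputs outState + input rather than outState alone. Stripped of the keywords plumbing, the stacking loop is essentially the following (a sketch with simplified types, not part of the patch):

    // Sketch of MLRNN's layer stacking.
    Expr input = x;
    for(int i = 0; i < layers_; ++i) {
      Expr outState = (*rnns_[i])(input);           // run layer i
      Expr output = (skip_ && (skipFirst_ || i > 0))
                      ? outState + input            // residual connection
                      : outState;                   // plain stacking
      input = output;                               // feed layer i + 1
    }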
class GRU { private: + std::string prefix_; + Expr U_, W_, b_; + Expr gamma1_; + Expr gamma2_; + bool final_; + bool layerNorm_; + float dropout_; + + Expr dropMaskX_; + Expr dropMaskS_; public: - GRU() {} template - void initialize( - ExpressionGraphPtr graph, + GRU(ExpressionGraphPtr graph, const std::string prefix, int dimInput, int dimState, - Args ...args) { + Args ...args) : prefix_(prefix) { + auto U = graph->param(prefix + "_U", {dimState, 2 * dimState}, keywords::init=inits::glorot_uniform); auto W = graph->param(prefix + "_W", {dimInput, 2 * dimState}, @@ -288,19 +367,49 @@ class GRU { b_ = concatenate({b, bx}, keywords::axis=1); final_ = Get(keywords::final, false, args...); + layerNorm_ = Get(keywords::normalize, false, args...); + + dropout_ = Get(keywords::dropout_prob, 0.0f, args...); + + if(layerNorm_) { + gamma1_ = graph->param(prefix + "_gamma1", {1, 3 * dimState}, + keywords::init=inits::from_value(1.f)); + gamma2_ = graph->param(prefix + "_gamma2", {1, 3 * dimState}, + keywords::init=inits::from_value(1.f)); + } + + if(dropout_> 0.0f) { + dropMaskX_ = graph->dropout(dropout_, {1, dimInput}); + dropMaskS_ = graph->dropout(dropout_, {1, dimState}); + } } - Expr apply(Expr input, Expr state, Expr mask = nullptr) { + Expr apply(Expr input, Expr state, + Expr mask = nullptr) { return apply2(apply1(input), state, mask); } Expr apply1(Expr input) { + if(dropMaskX_) + input = dropout(input, keywords::mask=dropMaskX_); + debug(input, "in"); auto xW = dot(input, W_); + if(layerNorm_) + xW = layer_norm(xW, gamma1_); return xW; } - Expr apply2(Expr xW, Expr state, Expr mask = nullptr) { + Expr apply2(Expr xW, Expr state, + Expr mask = nullptr) { + if(dropMaskS_) + state = dropout(state, keywords::mask=dropMaskS_); + debug(state, "state"); + auto sU = dot(state, U_); + + if(layerNorm_) + sU = layer_norm(sU, gamma2_); + auto output = mask ? 
gruOps({state, xW, sU, b_, mask}, final_) : gruOps({state, xW, sU, b_}, final_); @@ -309,6 +418,7 @@ class GRU { } }; + /***************************************************************/ template @@ -320,31 +430,29 @@ class AttentionCell { public: - AttentionCell(Attention&& att) - : cell1_(New()), - cell2_(New()), - att_(New(att)) {} - - template - void initialize(Ptr graph, - const std::string prefix, - int dimInput, - int dimState, - Args ...args) + template + AttentionCell(Ptr graph, + const std::string prefix, + int dimInput, + int dimState, + Ptr att, + Args ...args) { - cell1_->initialize(graph, - prefix + "_cell1", - dimInput, - dimState, - keywords::final=false, - args...); + cell1_ = New(graph, + prefix + "_cell1", + dimInput, + dimState, + keywords::final=false, + args...); - cell2_->initialize(graph, - prefix + "_cell2", - att_->outputDim(), - dimState, - keywords::final=true, - args...); + att_ = New(att); + + cell2_ = New(graph, + prefix + "_cell2", + att_->outputDim(), + dimState, + keywords::final=true, + args...); } Expr apply(Expr input, Expr state, Expr mask = nullptr) { @@ -361,11 +469,17 @@ class AttentionCell { return cell2_->apply(alignedSourceContext, hidden, mask); } + Ptr getAttention() { + return att_; + } + Expr getContexts() { return concatenate(att_->getContexts(), keywords::axis=2); } + + Expr getLastContext() { + return att_->getContexts().back(); + } }; -typedef AttentionCell CGRU; - } diff --git a/src/marian.h b/src/marian.h index 7e4fecb6..ba646e0b 100644 --- a/src/marian.h +++ b/src/marian.h @@ -25,3 +25,5 @@ #include "graph/expression_graph.h" #include "graph/expression_operators.h" #include "layers/param_initializers.h" +#include "training/training.h" +#include "training/graph_group.h" diff --git a/src/models/nematus.h b/src/models/dl4mt.h similarity index 62% rename from src/models/nematus.h rename to src/models/dl4mt.h index d80e5a5d..5e07fba7 100644 --- a/src/models/nematus.h +++ b/src/models/dl4mt.h @@ -1,7 +1,7 @@ #pragma once #include "data/corpus.h" -#include "command/config.h" +#include "training/config.h" #include "graph/expression_graph.h" #include "layers/rnn.h" #include "layers/param_initializers.h" @@ -11,10 +11,12 @@ namespace marian { -class Nematus : public ExpressionGraph { +class DL4MT { private: Ptr options_; - + + Ptr> rnn_; + int dimSrcVoc_{40000}; int dimSrcEmb_{512}; int dimEncState_{1024}; @@ -25,6 +27,8 @@ class Nematus : public ExpressionGraph { int dimBatch_{64}; + bool normalize_; + void setDims(Ptr graph, Ptr batch) { dimSrcVoc_ = graph->get("Wemb") ? 
graph->get("Wemb")->shape()[0] : dimSrcVoc_; @@ -39,14 +43,14 @@ class Nematus : public ExpressionGraph { } public: - - Nematus() {} - - Nematus(Ptr options) + + DL4MT(Ptr options) : options_(options) { - + auto dimVocabs = options->get>("dim-vocabs"); - + + normalize_ = options->get("normalize"); + dimSrcVoc_ = dimVocabs[0]; dimSrcEmb_ = options->get("dim-emb"); dimEncState_ = options->get("dim-rnn"); @@ -56,16 +60,16 @@ class Nematus : public ExpressionGraph { dimBatch_ = options->get("mini-batch"); } - + void load(Ptr graph, const std::string& name) { using namespace keywords; LOG(info) << "Loading model from " << name; - + auto numpy = cnpy::npz_load(name); - auto parameters = { + std::vector parameters = { // Source word embeddings "Wemb", @@ -102,6 +106,20 @@ class Nematus : public ExpressionGraph { "ff_logit_W", "ff_logit_b", }; + std::vector parametersNorm = { + "decoder_att_gamma1", "decoder_att_gamma2", + "decoder_cell1_gamma1", "decoder_cell1_gamma2", + "decoder_cell2_gamma1", "decoder_cell2_gamma2", + "encoder_gamma1", "encoder_gamma2", + "encoder_r_gamma1", "encoder_r_gamma2", + "ff_logit_l1_gamma0", "ff_logit_l1_gamma1", + "ff_logit_l1_gamma2", "ff_state_gamma" + }; + + if(normalize_) + for(auto& p : parametersNorm) + parameters.push_back(p); + std::map nameMap = { {"decoder_U", "decoder_cell1_U"}, {"decoder_W", "decoder_cell1_W"}, @@ -129,6 +147,9 @@ class Nematus : public ExpressionGraph { }; for(auto name : parameters) { + UTIL_THROW_IF2(numpy.count(name) == 0, + "Parameter " << name << " does not exist."); + Shape shape; if(numpy[name].shape.size() == 2) { shape.set(0, numpy[name].shape[0]); @@ -152,7 +173,7 @@ class Nematus : public ExpressionGraph { const std::string& name) { LOG(info) << "Saving model to " << name; - + unsigned shape[2]; std::string mode = "w"; @@ -274,54 +295,148 @@ class Nematus : public ExpressionGraph { return std::make_tuple(y, yMask, yIdx); } + std::tuple encoder(Ptr graph, + Ptr batch) { + using namespace keywords; + + auto xEmb = Embedding("Wemb", dimSrcVoc_, dimSrcEmb_)(graph); + + Expr x, xMask; + std::tie(x, xMask) = prepareSource(xEmb, batch, 0); + + auto xfw = RNN(graph, "encoder", + dimSrcEmb_, dimEncState_, + normalize=normalize_, + direction=dir::forward)(x); + + auto xbw = RNN(graph, "encoder_r", + dimSrcEmb_, dimEncState_, + normalize=normalize_, + direction=dir::backward)(x, mask=xMask); + + auto xContext = concatenate({xfw, xbw}, axis=1); + + return std::make_tuple(xContext, xMask); + } + + std::tuple step(Expr hyps, + const std::vector hypIdx = {}, + const std::vector embIdx = {}) { + using namespace keywords; + auto graph = hyps->graph(); + + Expr selectedHyps, selectedEmbs; + if(embIdx.empty()) { + selectedHyps = hyps; + selectedEmbs = graph->constant(shape={1, dimTrgEmb_}, + init=inits::zeros); + } + else { + // @TODO : solve this better than reshaping! + selectedHyps = reshape(rows(hyps, hypIdx), + {1, hyps->shape()[1], 1, (int)hypIdx.size()}); + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + selectedEmbs = reshape(rows(yEmb, embIdx), + {1, yEmb->shape()[1], 1, (int)embIdx.size()}); + } + Expr newHyps, logits; + std::tie(newHyps, logits) = step(selectedHyps, selectedEmbs, true); + return std::make_tuple(newHyps, logsoftmax(logits)); + } + + std::tuple step(Expr yInStates, Expr yEmbeddings, + bool single = false) { + using namespace keywords; + + auto yOutStates = (*rnn_)(yEmbeddings, yInStates); + auto yCtx = single ? 
+ rnn_->getCell()->getLastContext() : + rnn_->getCell()->getContexts(); + + //// 2-layer feedforward network for outputs and cost + auto yLogitsL1 = Dense("ff_logit_l1", dimTrgEmb_, + activation=act::tanh, + normalize=normalize_) + (yEmbeddings, yOutStates, yCtx); + + auto yLogitsL2 = Dense("ff_logit_l2", dimTrgVoc_) + (yLogitsL1); + + return std::make_tuple(yOutStates, yLogitsL2); + } + + Expr startState(Expr context, Expr mask) { + using namespace keywords; + + auto meanContext = weighted_average(context, mask, axis=2); + auto start = Dense("ff_state", + dimDecState_, + activation=act::tanh, + normalize=normalize_)(meanContext); + return start; + } + + Expr buildEncoder(Ptr graph, Ptr batch) { + using namespace keywords; + graph->clear(); + rnn_.reset(); + setDims(graph, batch); + + Expr xContext, xMask; + std::tie(xContext, xMask) = encoder(graph, batch); + + auto attention = New("decoder", + xContext, dimDecState_, + mask=xMask, normalize=normalize_); + rnn_ = New>(graph, "decoder", + dimTrgEmb_, dimDecState_, + attention, + normalize=normalize_); + + return startState(xContext, xMask); + } + + std::tuple embeddings(Ptr graph, + Ptr batch) { + using namespace keywords; + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + Expr y, yMask, yIdx; + std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1); + auto yEmpty = graph->zeros(shape={dimBatch_, dimTrgEmb_}); + auto yShifted = concatenate({yEmpty, y}, axis=2); + + return std::make_tuple(yShifted, yMask, yIdx); + } + Expr build(Ptr graph, Ptr batch) { using namespace keywords; graph->clear(); - + rnn_.reset(); setDims(graph, batch); - // Embeddings - auto xEmb = Embedding("Wemb", dimSrcVoc_, dimSrcEmb_)(graph); - auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + Expr xContext, xMask; + std::tie(xContext, xMask) = encoder(graph, batch); + auto yStartStates = startState(xContext, xMask); - Expr x, xMask; - Expr y, yMask, yIdx; + Expr yEmbeddings, yMask, yIdx; + std::tie(yEmbeddings, yMask, yIdx) = embeddings(graph, batch); - std::tie(x, xMask) = prepareSource(xEmb, batch, 0); - std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1); + auto attention = New("decoder", + xContext, dimDecState_, + mask=xMask, normalize=normalize_); + rnn_ = New>(graph, "decoder", + dimTrgEmb_, dimDecState_, + attention, + normalize=normalize_); - // Encoder - auto xContext = BiRNN("encoder", dimEncState_) - (x, mask=xMask); + Expr yOutStates, yLogits; + std::tie(yOutStates, yLogits) = step(yStartStates, yEmbeddings); - auto xMeanContext = weighted_average(xContext, xMask, axis=2); + auto cost = CrossEntropyCost("cost")(yLogits, yIdx, mask=yMask); - // Decoder - auto yStart = Dense("ff_state", - dimDecState_, - activation=act::tanh)(xMeanContext); - - auto yEmpty = graph->zeros(shape={dimBatch_, dimTrgEmb_}); - auto yShifted = concatenate({yEmpty, y}, axis=2); - //auto yShifted = shift(y, 1, axis=2); - - CGRU cgru({"decoder", xContext, dimDecState_, mask=xMask}); - auto yLstm = RNN("decoder", dimDecState_, cgru) - (yShifted, yStart); - auto yCtx = cgru.getContexts(); - - //// 2-layer feedforward network for outputs and cost - auto ff_logit_l1 = Dense("ff_logit_l1", dimTrgEmb_, - activation=act::tanh) - (yShifted, yLstm, yCtx); - - auto ff_logit_l2 = Dense("ff_logit_l2", dimTrgVoc_) - (ff_logit_l1); - - auto cost = CrossEntropyCost("cost") - (ff_logit_l2, yIdx, mask=yMask); - return cost; } }; diff --git a/src/models/encdec.h b/src/models/encdec.h new file mode 100644 index 00000000..0a494e4a --- /dev/null +++ 
b/src/models/encdec.h @@ -0,0 +1,210 @@ +#pragma once + +#include "data/corpus.h" +#include "training/config.h" +#include "graph/expression_graph.h" +#include "layers/rnn.h" +#include "layers/param_initializers.h" +#include "layers/generic.h" +#include "common/logging.h" + +namespace marian { + +class EncoderBase { + protected: + Ptr options_; + + virtual std::tuple + prepareSource(Expr emb, Ptr batch, size_t index) { + using namespace keywords; + std::vector indeces; + std::vector mask; + + for(auto& word : (*batch)[index]) { + for(auto i: word.first) + indeces.push_back(i); + for(auto m: word.second) + mask.push_back(m); + } + + int dimBatch = batch->size(); + int dimEmb = emb->shape()[1]; + int dimWords = (int)(*batch)[index].size(); + + auto graph = emb->graph(); + auto x = reshape(rows(emb, indeces), {dimBatch, dimEmb, dimWords}); + auto xMask = graph->constant(shape={dimBatch, 1, dimWords}, + init=inits::from_vector(mask)); + return std::make_tuple(x, xMask); + } + + public: + EncoderBase(Ptr options) + : options_(options) {} + + virtual std::tuple + build(Ptr, Ptr, size_t = 0) = 0; +}; + +class DecoderBase { + protected: + Ptr options_; + + virtual std::tuple + prepareTarget(Expr emb, Ptr batch, size_t index) { + using namespace keywords; + + std::vector indeces; + std::vector mask; + std::vector findeces; + + for(int j = 0; j < (*batch)[index].size(); ++j) { + auto& trgWordBatch = (*batch)[index][j]; + + for(auto i : trgWordBatch.first) { + findeces.push_back((float)i); + if(j < (*batch)[index].size() - 1) + indeces.push_back(i); + } + + for(auto m : trgWordBatch.second) + mask.push_back(m); + } + + int dimBatch = batch->size(); + int dimEmb = emb->shape()[1]; + int dimWords = (int)(*batch)[index].size(); + + auto graph = emb->graph(); + + auto y = reshape(rows(emb, indeces), + {dimBatch, dimEmb, dimWords - 1}); + + auto yMask = graph->constant(shape={dimBatch, 1, dimWords}, + init=inits::from_vector(mask)); + auto yIdx = graph->constant(shape={(int)findeces.size(), 1}, + init=inits::from_vector(findeces)); + + return std::make_tuple(y, yMask, yIdx); + } + + public: + DecoderBase(Ptr options) + : options_(options) {} + + virtual std::tuple + groundTruth(Ptr graph, + Ptr batch) { + using namespace keywords; + + int dimBatch = batch->size(); + int dimTrgVoc = options_->get>("dim-vocabs").back(); + int dimTrgEmb = options_->get("dim-emb"); + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc, dimTrgEmb)(graph); + Expr y, yMask, yIdx; + std::tie(y, yMask, yIdx) = prepareTarget(yEmb, batch, 1); + auto yEmpty = graph->zeros(shape={dimBatch, dimTrgEmb}); + auto yShifted = concatenate({yEmpty, y}, axis=2); + + return std::make_tuple(yShifted, yMask, yIdx); + } + + virtual Expr + buildStartState(Expr context, Expr mask) { + using namespace keywords; + + auto meanContext = weighted_average(context, mask, axis=2); + + bool layerNorm = options_->get("normalize"); + auto start = Dense("ff_state", + options_->get("dim-rnn"), + activation=act::tanh, + normalize=layerNorm)(meanContext); + return start; + } + + virtual std::tuple> + step(Expr embeddings, std::vector states, + Expr context, Expr contextMask, bool single=false) = 0; +}; + +template +class Seq2Seq { + protected: + Ptr options_; + Ptr encoder_; + Ptr decoder_; + + public: + + Seq2Seq(Ptr options) + : options_(options), + encoder_(New(options)), + decoder_(New(options)) + {} + + virtual void load(Ptr graph, + const std::string& name) { + graph->load(name); + } + + virtual void save(Ptr graph, + const std::string& name) { + graph->save(name); 
+ } + + virtual std::tuple, Expr, Expr> + buildEncoder(Ptr graph, + Ptr batch) { + using namespace keywords; + graph->clear(); + encoder_ = New(options_); + decoder_ = New(options_); + + Expr srcContext, srcMask; + std::tie(srcContext, srcMask) = encoder_->build(graph, batch); + auto startState = decoder_->buildStartState(srcContext, srcMask); + + size_t decoderLayers = options_->get("layers-dec"); + std::vector startStates(decoderLayers, startState); + + return std::make_tuple(startStates, srcContext, srcMask); + } + + virtual std::tuple> + step(Expr embeddings, + std::vector states, + Expr context, + Expr contextMask, + bool single=false) { + return decoder_->step(embeddings, states, context, contextMask, single); + } + + virtual Expr build(Ptr graph, + Ptr batch) { + using namespace keywords; + + std::vector startStates; + Expr srcContext, srcMask; + std::tie(startStates, srcContext, srcMask) = buildEncoder(graph, batch); + + Expr trgEmbeddings, trgMask, trgIdx; + std::tie(trgEmbeddings, trgMask, trgIdx) = decoder_->groundTruth(graph, batch); + + Expr trgLogits; + std::vector trgStates; + std::tie(trgLogits, trgStates) = decoder_->step(trgEmbeddings, + startStates, + srcContext, + srcMask); + + auto cost = CrossEntropyCost("cost")(trgLogits, trgIdx, + mask=trgMask); + + return cost; + } + +}; + +} diff --git a/src/models/feedforward.h b/src/models/feedforward.h deleted file mode 100644 index e2e6ad7b..00000000 --- a/src/models/feedforward.h +++ /dev/null @@ -1,119 +0,0 @@ -#pragma once - -#include "graph/expression_graph.h" - -namespace marian { - -/** - * @brief Namespace for code related to managing models in Marian - */ -namespace models { - -/** - * @brief Constructs an expression graph representing a feed-forward classifier. - * - * @param dims number of nodes in each layer of the feed-forward classifier - * - * @return a shared pointer to the newly constructed expression graph - */ -void FeedforwardClassifier(ExpressionGraphPtr g, - const std::vector& dims, - size_t batchSize, - bool training = true) { - using namespace keywords; - std::cerr << "Building Multi-layer Feedforward network" << std::endl; - std::cerr << "\tLayer dimensions:"; - for(auto d : dims) - std::cerr << " " << d; - std::cerr << std::endl; - boost::timer::cpu_timer timer; - - // Construct a shared pointer to an empty expression graph - g->clear(); - - // Construct an input node called "x" and add it to the expression graph. - // - // For each observed data point, this input will hold a vector of values describing that data point. - // dims.front() specifies the size of this vector - // - // For example, in the MNIST task, for any given image in the training set, - // "x" would hold a vector of pixel values for that image. - // - // Because calculating over one observed data point at a time can be inefficient, - // it is customary to operate over a batch of observed data points at once. - // - // At this point, we do not know the batch size: - // whatevs therefore serves as a placeholder for the batch size, which will be specified later - // - // Once the batch size is known, "x" will represent a matrix with dimensions [batch_size, dims.front()]. - // Each row of this matrix will correspond with the observed data vector for one observed data point. - auto x = name(g->input(shape={(int)batchSize, dims.front()}), "x"); - - // Construct an input node called "y" and add it to the expression graph. - // - // For each observed data point, this input will hold the ground truth label for that data point. 
- // dims.back() specifies the size of this vector - // - // For example, in the MNIST task, for any given image in the training set, - // "y" might hold one-hot vector representing which digit (0-9) is shown in that image - // - // Because calculating over one observed data point at a time can be inefficient, - // it is customary to operate over a batch of observed data points at once. - // - // At this point, we do not know the batch size: - // whatevs therefore serves as a placeholder for the batch size, which will be specified later - // - // Once the batch size is known, "y" will represent a matrix with dimensions [batch_size, dims.front()]. - // Each row of this matrix will correspond with the ground truth data vector for one observed data point. - auto y = name(g->input(shape={(int)batchSize, 1}), "y"); - - std::vector layers, weights, biases; - for(int i = 0; i < dims.size()-1; ++i) { - int in = dims[i]; - int out = dims[i+1]; - - if(i == 0) { - // Create a dropout node as the parent of x, - // and place that dropout node as the value of layers[0] - layers.emplace_back(dropout(x, value=0.2)); - } else { - // Multiply the matrix in layers[i-1] by the matrix in weights[i-1] - // Take the result, and perform matrix addition on biases[i-1]. - // Wrap the result in rectified linear activation function, - // and finally wrap that in a dropout node - layers.emplace_back(dropout(relu(affine(layers.back(), weights.back(), biases.back())), - value=0.5)); - } - - // Construct a weight node for the outgoing connections from layer i - weights.emplace_back( - g->param("W" + std::to_string(i), {in, out}, - init=inits::uniform())); - - // Construct a bias node. By definition, a bias node stores the value 1. - // Therefore, we don't actually store the 1. - // Instead, the bias node object stores the weights on the connections - // that are outgoing from the bias node. 
- // These weights are initialized to zero - biases.emplace_back( - g->param("b" + std::to_string(i), {1, out}, - init=inits::zeros)); - } - - // Perform matrix multiplication and addition for the last layer - auto last = affine(layers.back(), weights.back(), biases.back()); - - if(training) { - // Define a top-level node for training - auto cost = name(mean(cross_entropy(last, y), axis=0), "cost"); - } - else { - // Define a top-level node for inference - auto scores = name(softmax(last), "scores"); - } - - std::cerr << "\tTotal time: " << timer.format(5, "%ws") << std::endl; -}; - -} -} diff --git a/src/models/gnmt.h b/src/models/gnmt.h new file mode 100644 index 00000000..634f03e9 --- /dev/null +++ b/src/models/gnmt.h @@ -0,0 +1,153 @@ +#pragma once + +#include "models/encdec.h" +#include "layers/attention.h" + +namespace marian { + + typedef AttentionCell CGRU; + + class EncoderGNMT : public EncoderBase { + public: + EncoderGNMT(Ptr options) + : EncoderBase(options) {} + + std::tuple + build(Ptr graph, + Ptr batch, + size_t batchIdx = 0) { + + using namespace keywords; + + int dimSrcVoc = options_->get>("dim-vocabs")[batchIdx]; + int dimSrcEmb = options_->get("dim-emb"); + int dimEncState = options_->get("dim-rnn"); + bool layerNorm = options_->get("normalize"); + bool skipDepth = options_->get("skip"); + size_t encoderLayers = options_->get("layers-enc"); + float dropoutRnn = options_->get("dropout-rnn"); + + auto xEmb = Embedding("Wemb", dimSrcVoc, dimSrcEmb)(graph); + + Expr x, xMask; + std::tie(x, xMask) = prepareSource(xEmb, batch, batchIdx); + + auto xFw = RNN(graph, "encoder_bi", + dimSrcEmb, dimEncState, + normalize=layerNorm, + dropout_prob=dropoutRnn) + (x); + + auto xBw = RNN(graph, "encoder_bi_r", + dimSrcEmb, dimEncState, + normalize=layerNorm, + direction=dir::backward, + dropout_prob=dropoutRnn) + (x, mask=xMask); + + debug(xFw, "xFw"); + if(encoderLayers > 1) { + auto xBi = concatenate({xFw, xBw}, axis=1); + + Expr xContext; + std::vector states; + std::tie(xContext, states) + = MLRNN(graph, "encoder", encoderLayers - 1, + 2 * dimEncState, dimEncState, + normalize=layerNorm, + skip=skipDepth, + dropout_prob=dropoutRnn) + (xBi); + return std::make_tuple(xContext, xMask); + } + else { + auto xContext = concatenate({xFw, xBw}, axis=1); + return std::make_tuple(xContext, xMask); + } + } +}; + +class DecoderGNMT : public DecoderBase { + private: + Ptr attention_; + + public: + DecoderGNMT(Ptr options) + : DecoderBase(options) {} + + virtual std::tuple> + step(Expr embeddings, + std::vector states, + Expr context, + Expr contextMask, + bool single) { + using namespace keywords; + + int dimTrgVoc = options_->get>("dim-vocabs").back(); + int dimTrgEmb = options_->get("dim-emb"); + int dimDecState = options_->get("dim-rnn"); + bool layerNorm = options_->get("normalize"); + bool skipDepth = options_->get("skip"); + size_t decoderLayers = options_->get("layers-dec"); + float dropoutRnn = options_->get("dropout-rnn"); + + auto graph = embeddings->graph(); + + if(!attention_) + attention_ = New("decoder", + context, dimDecState, + mask=contextMask, + normalize=layerNorm); + RNN rnnL1(graph, "decoder", + dimTrgEmb, dimDecState, + attention_, + normalize=layerNorm, + dropout_prob=dropoutRnn); + auto stateL1 = rnnL1(embeddings, states[0]); + auto alignedContext = single ? 
+ rnnL1.getCell()->getLastContext() : + rnnL1.getCell()->getContexts(); + + std::vector statesOut; + statesOut.push_back(stateL1); + + Expr outputLn; + if(decoderLayers > 1) { + std::vector statesIn; + for(int i = 1; i < states.size(); ++i) + statesIn.push_back(states[i]); + + std::vector statesLn; + std::tie(outputLn, statesLn) = MLRNN(graph, "decoder", + decoderLayers - 1, + dimDecState, dimDecState, + normalize=layerNorm, + dropout_prob=dropoutRnn, + skip=skipDepth, + skip_first=skipDepth) + (stateL1, statesIn); + + statesOut.insert(statesOut.end(), + statesLn.begin(), statesLn.end()); + } + else { + outputLn = stateL1; + } + + //// 2-layer feedforward network for outputs and cost + auto logitsL1 = Dense("ff_logit_l1", dimTrgEmb, + activation=act::tanh, + normalize=layerNorm) + (embeddings, outputLn, alignedContext); + + auto logitsL2 = Dense("ff_logit_l2", dimTrgVoc) + (logitsL1); + + return std::make_tuple(logitsL2, statesOut); + } + +}; + +typedef Seq2Seq GNMT; + +} diff --git a/src/optimizers/optimizers.h b/src/optimizers/optimizers.h index 01b4aebd..b1ea10d5 100644 --- a/src/optimizers/optimizers.h +++ b/src/optimizers/optimizers.h @@ -4,20 +4,18 @@ #include #include "kernels/tensor_operators.h" +#include "training/config.h" #include "optimizers/clippers.h" namespace marian { -// @TODO: modify computation graph to group all paramters in single matrix object. -// This will allow to perform a single large SGD update per batch. Currently there -// are as many updates as different parameters. - class OptimizerBase { public: template - OptimizerBase(Args... args) - : clipper_(Get(keywords::clip, nullptr, args...)) {} - + OptimizerBase(float eta, Args... args) + : clipper_(Get(keywords::clip, nullptr, args...)), + eta_(eta) {} + float backpropUpdate(Ptr graph) { graph->forward(); float cost = graph->topNode()->scalar(); @@ -29,43 +27,46 @@ class OptimizerBase { void update(Ptr graph) { Tensor p = graph->params().vals(); Tensor g = graph->params().grads(); - update(p, g); + update(p, g); } - + void update(Tensor params, Tensor grads) { if(clipper_) clipper_->clip(grads); updateImpl(params, grads); } - - private: - + + void updateSchedule() { + eta_ *= 0.5; + LOG(info) << "Changing learning rate to " << eta_; + } + + protected: + virtual void updateImpl(Tensor params, Tensor grads) = 0; - + Ptr clipper_; + float eta_; }; class Sgd : public OptimizerBase { public: template - Sgd(float eta=0.01, Args... args) - : OptimizerBase(args...), eta_(eta) {} + Sgd(float eta, Args... args) + : OptimizerBase(eta, args...) 
{} private: void updateImpl(Tensor params, Tensor grads) { Element(_1 -= eta_ * _2, params, grads); } - - float eta_; }; // @TODO: Add serialization for historic gradients and parameters class Adagrad : public OptimizerBase { public: template - Adagrad(float eta=0.01, Args ...args) - : OptimizerBase(args...), - eta_(eta), + Adagrad(float eta, Args ...args) + : OptimizerBase(eta, args...), eps_(Get(keywords::eps, 1e-8, args...)) {} @@ -80,7 +81,7 @@ class Adagrad : public OptimizerBase { alloc_->allocate(gt_, {1, totalSize}); gt_->set(0); } - + Element(_1 += (_2 * _2), gt_, grads); @@ -88,7 +89,6 @@ class Adagrad : public OptimizerBase { params, gt_, grads); } - float eta_; float eps_; Ptr alloc_; Tensor gt_; @@ -100,9 +100,8 @@ class Adagrad : public OptimizerBase { class Adam : public OptimizerBase { public: template - Adam(float eta = 0.0001, Args ...args) - : OptimizerBase(args...), - eta_(eta), + Adam(float eta, Args ...args) + : OptimizerBase(eta, args...), beta1_(Get(keywords::beta1, 0.9, args...)), beta2_(Get(keywords::beta2, 0.999, args...)), eps_(Get(keywords::eps, 1e-8, args...)), @@ -110,7 +109,7 @@ class Adam : public OptimizerBase { {} void updateImpl(Tensor params, Tensor grads) { - + if(!mtAlloc_) mtAlloc_ = New(params->getDevice()); if(!vtAlloc_) @@ -128,9 +127,9 @@ class Adam : public OptimizerBase { } t_++; - float denom1 = 1 - pow(beta1_, t_); - float denom2 = 1 - pow(beta2_, t_); - + float denom1 = 1 - std::pow(beta1_, t_); + float denom2 = 1 - std::pow(beta2_, t_); + Element(_1 = (beta1_ * _1) + ((1 - beta1_) * _2), mt_, grads); Element(_1 = (beta2_ * _1) + ((1 - beta2_) * (_2 * _2)), @@ -141,7 +140,6 @@ class Adam : public OptimizerBase { } private: - float eta_; float beta1_; float beta2_; float eps_; @@ -158,4 +156,29 @@ Ptr Optimizer(Args&& ...args) { return Ptr(new Algorithm(args...)); } +Ptr Optimizer(Ptr options) { + + Ptr clipper = nullptr; + float clipNorm = options->get("clip-norm"); + if(clipNorm > 0) + clipper = Clipper(clipNorm); + + float lrate = options->get("learn-rate"); + + std::string opt = options->get("optimizer"); + + if(opt == "sgd") { + return Optimizer(lrate, keywords::clip=clipper); + } + else if(opt == "adagrad") { + return Optimizer(lrate, keywords::clip=clipper); + } + else if(opt == "adam") { + return Optimizer(lrate, keywords::clip=clipper); + } + else { + UTIL_THROW2("Unknown optimizer: " << opt); + } +} + } diff --git a/src/tensors/tensor.cu b/src/tensors/tensor.cu index 19aec0cb..5c28066b 100644 --- a/src/tensors/tensor.cu +++ b/src/tensors/tensor.cu @@ -21,7 +21,6 @@ #include -#include #include #include @@ -87,7 +86,7 @@ void TensorBase::set(const std::vector &v) { void TensorBase::copyFrom(Tensor in) { cudaSetDevice(device_); - CUDA_CHECK(cudaMemcpy(data_, in->data(), in->size() * sizeof(float), + CUDA_CHECK(cudaMemcpy(data_ , in->data() , in->size() * sizeof(float), cudaMemcpyDefault)); cudaStreamSynchronize(0); } @@ -100,7 +99,8 @@ std::string TensorBase::debug() { for(int i = 1; i < shape_.size(); ++i) strm << "x" << shape_[i]; strm << " size=" << shape_.elements() - << " (" << shape_.elements() * sizeof(float) << "B)" << std::endl; + << " (" << shape_.elements() * sizeof(float) << "B)"; + strm << " device=" << device_ << std::endl; // values size_t totSize = shape_.elements(); @@ -109,81 +109,90 @@ std::string TensorBase::debug() { strm << std::fixed << std::setprecision(8) << std::setfill(' '); - for(size_t k = 0; k < shape()[2]; ++k) { - strm << "[ "; - if(shape()[0] > 10) { - for (size_t i = 0; i < shape()[0] && i < 3; ++i) { 
- if(i > 0) - strm << std::endl << " "; - for (size_t j = 0; j < shape()[1] && j < 3; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - if(shape()[1] > 3) - strm << "... "; - for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - } - strm << std::endl << " ..."; - for (size_t i = shape()[0] - 3; i < shape()[0]; ++i) { - if(i > 0) - strm << std::endl << " "; - for (size_t j = 0; j < shape()[1] && j < 3; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - if(shape()[1] > 3) - strm << "... "; - for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - } - } - else { - for (size_t i = 0; i < shape()[0] && i < 10; ++i) { - if(i > 0) - strm << std::endl << " "; - for (size_t j = 0; j < shape()[1] && j < 3; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - if(shape()[1] > 3) - strm << "... "; - for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { - strm << std::setw(12) - << values[ i * shape().stride(0) - + j * shape().stride(1) - + k * shape().stride(2) ] << " "; - } - } - } - strm << "]" << std::endl; + for(size_t l = 0; l < shape()[3]; ++l) { + for(size_t k = 0; k < shape()[2]; ++k) { + strm << "[ "; + if(shape()[0] > 10) { + for (size_t i = 0; i < shape()[0] && i < 3; ++i) { + if(i > 0) + strm << std::endl << " "; + for (size_t j = 0; j < shape()[1] && j < 3; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + if(shape()[1] > 3) + strm << "... "; + for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + } + strm << std::endl << " ..."; + for (size_t i = shape()[0] - 3; i < shape()[0]; ++i) { + if(i > 0) + strm << std::endl << " "; + for (size_t j = 0; j < shape()[1] && j < 3; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + if(shape()[1] > 3) + strm << "... "; + for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + } + } + else { + for (size_t i = 0; i < shape()[0] && i < 10; ++i) { + if(i > 0) + strm << std::endl << " "; + for (size_t j = 0; j < shape()[1] && j < 3; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + if(shape()[1] > 3) + strm << "... 
"; + for (size_t j = shape()[1] - 3; j < shape()[1]; ++j) { + strm << std::setw(12) + << values[ i * shape().stride(0) + + j * shape().stride(1) + + k * shape().stride(2) + + l * shape().stride(3) ] << " "; + } + } + } + strm << "]" << std::endl; + } } return strm.str(); } DeviceGPU::~DeviceGPU() { - cudaSetDevice(device_); - if(data_) - CUDA_CHECK(cudaFree(data_)); + cudaSetDevice(device_); + if(data_) + CUDA_CHECK(cudaFree(data_)); + cudaDeviceSynchronize(); } void DeviceGPU::reserve(size_t size) { cudaSetDevice(device_); - + UTIL_THROW_IF2(size < size_, "New size must be larger than old size"); if(data_) { diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h index b61b2bde..78fa0660 100644 --- a/src/tensors/tensor.h +++ b/src/tensors/tensor.h @@ -25,9 +25,6 @@ #include #include #include -#ifdef CUDNN -#include -#endif #include "3rd_party/exception.h" #include "common/definitions.h" @@ -40,30 +37,14 @@ class TensorBase : public std::enable_shared_from_this { float* data_; Shape shape_; size_t device_; -#ifdef CUDNN - cudnnTensorDescriptor_t cudnnDesc_; -#endif public: TensorBase(float* data, Shape shape, size_t device) : data_(data), shape_(shape), device_(device) - { -#ifdef CUDNN - cudnnCreateTensorDescriptor(&cudnnDesc_); - cudnnSetTensor4dDescriptorEx(cudnnDesc_, CUDNN_DATA_FLOAT, - shape_[0], shape_[1], - shape_[2], shape_[3], - shape_.stride(0), shape_.stride(1), - shape_.stride(2), shape_.stride(3)); -#endif - } + {} ~TensorBase() - { -#ifdef CUDNN - cudnnDestroyTensorDescriptor(cudnnDesc_); -#endif - } + {} virtual void reset(float* data) { data_ = data; @@ -90,6 +71,10 @@ class TensorBase : public std::enable_shared_from_this { return device_; } + Tensor subtensor(int offset, int size){ + return Tensor(new TensorBase(data_ + offset, {1, size}, device_ )); + } + float get(size_t i); void set(size_t i, float value); @@ -102,12 +87,6 @@ class TensorBase : public std::enable_shared_from_this { void copyFrom(Tensor); -#ifdef CUDNN - cudnnTensorDescriptor_t& cudnn() { - return cudnnDesc_; - } -#endif - std::string debug(); }; diff --git a/src/tensors/tensor_allocator.h b/src/tensors/tensor_allocator.h index 62932ed8..35b51ccb 100644 --- a/src/tensors/tensor_allocator.h +++ b/src/tensors/tensor_allocator.h @@ -94,9 +94,14 @@ class TensorAllocator { gaps_.insert(lastGap_); } + ~TensorAllocator() { + clear(); + } + void reserve(size_t elements = 0) { float mult = elements / FLOATS + 1; - std::cerr << "Extending reserved space to " << mult * CHUNK << " MB" << std::endl; + LOG(memory) << "Extending reserved space to " + << mult * CHUNK << " MB (device " << device_.getDevice() << ")"; size_t old = device_.capacity(); float* oldStart = device_.data(); @@ -106,8 +111,8 @@ class TensorAllocator { void reserveExact(size_t elements = 0) { size_t mbytes = (elements * sizeof(float)) / MBYTE; - std::cerr << "Reserving space for " << elements - << " floats (" << mbytes << " MB)" << std::endl; + LOG(memory) << "Reserving space for " << elements + << " floats (" << mbytes << " MB, device " << device_.getDevice() << ")"; size_t old = device_.capacity(); float* oldStart = device_.data(); diff --git a/src/test/bn_test.cu b/src/test/bn_test.cu new file mode 100644 index 00000000..77587e79 --- /dev/null +++ b/src/test/bn_test.cu @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "layers/generic.h" +#include "marian.h" + +int main(int argc, char** argv) { + using namespace marian; + using namespace data; + using namespace 
keywords; + + auto options = New(argc, argv, false); + + int batchSize = 128; + + std::vector temp(batchSize * 3072); + std::vector temp2(3072 * 3072); + std::vector indeces(batchSize, 0.f); + + std::random_device rnd_device; + // Specify the engine and distribution. + std::mt19937 mersenne_engine(rnd_device()); + mersenne_engine.seed(1234); + std::uniform_real_distribution dist(-1.f, 1.f); + + auto gen = std::bind(dist, mersenne_engine); + std::generate(std::begin(temp), std::end(temp), gen); + std::generate(std::begin(temp2), std::end(temp2), gen); + + { + auto graph = New(); + graph->setDevice(0); + graph->reserveWorkspaceMB(128); + + auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp)); + auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0)); + auto beta = graph->param("beta", {1, 3072}, init=inits::zeros); + + auto y = layer_norm(x, gamma, beta); + + auto yLogitsL1 = Dense("ff_logit_l1", 512, + activation=act::tanh, + normalize=true) + (y, y, y); + + auto yLogitsL2 = Dense("ff_logit_l2", 50000) + (yLogitsL1); + + auto idx = graph->constant(shape={(int)indeces.size(), 1}, + init=inits::from_vector(indeces)); + auto ce = cross_entropy(yLogitsL2, idx); + auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0); + + debug(x, "x"); + debug(gamma, "gamma"); + debug(beta, "beta"); + + graph->forward(); + graph->backward(); + } + + /*{ + auto graph = New(); + graph->setDevice(0); + graph->reserveWorkspaceMB(128); + + auto x = graph->param("x", {batchSize, 3072}, init=inits::from_vector(temp)); + auto gamma = graph->param("gamma", {1, 3072}, init=inits::from_value(2.0)); + auto beta = graph->param("beta", {1, 3072}, init=inits::zeros); + + auto y = layer_norm(x, gamma, beta); + + auto w = graph->param("w", {3072, 3072}, init=inits::from_vector(temp2)); + + auto y2 = tanh(layer_norm(dot(y, w), gamma, beta)); + + auto idx = graph->constant(shape={(int)indeces.size(), 1}, + init=inits::from_vector(indeces)); + auto ce = cross_entropy(y2, idx); + auto cost = mean(sum(ce, keywords::axis=2), keywords::axis=0); + + debug(x, "x"); + debug(gamma, "gamma"); + debug(beta, "beta"); + + graph->forward(); + graph->backward(); + }*/ + + return 0; +} diff --git a/src/test/dropout_test.cu b/src/test/dropout_test.cu index 3e4bad69..17fda988 100644 --- a/src/test/dropout_test.cu +++ b/src/test/dropout_test.cu @@ -6,68 +6,26 @@ #include #include -#include "tensors/tensor_allocator.h" -#include "tensors/tensor.h" -#include "kernels/tensor_operators.h" - -#include "layers/dropout.h" - -#include "kernels/dropout_cudnn.h" +#include "training/config.h" +#include "marian.h" +#include "layers/param_initializers.h" using namespace marian; +using namespace keywords; -int main() { - int cudaDevice = 0; - TensorAllocator* params = new TensorAllocator(cudaDevice); +int main(int argc, char** argv) { + auto c = New(argc, argv); - cublasHandle_t handle = create_handle(cudaDevice); - - int rows = 64; - int cols = 2048; - int layers = 64; - - std::cerr << "Number of elements in tensor: " << rows * cols * layers << std::endl; - int rep = 1000; - const float prob = 0.5f; - - Tensor dropoutMatrix; - params->allocate(dropoutMatrix, {rows, cols, layers}); - - DropoutGenerator dropout(0); - - cudaStreamSynchronize(0); - boost::timer::cpu_timer timer; - - for (int i = 0; i < rep;++i) { - dropout.Generate(dropoutMatrix, prob); + auto g = New(); + g->setDevice(0); + g->reserveWorkspaceMB(512); + for(int i = 0; i < 10; ++i) { + g->clear(); + auto mask = g->dropout(0.2, {10, 3072}); + 
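// A CPU sketch of the mask that g->dropout(0.2, {10, 3072}) below asks the
// GPU kernel (kernels/dropout.cu) to generate. Assumption: the mask is a
// scaled ("inverted") Bernoulli mask, with survivors scaled by 1/(1-p), as
// the "scaling dropout" wording of --dropout-rnn suggests; the dropoutMask
// helper is illustrative only, not Marian API.
#include <cstddef>
#include <random>
#include <vector>

std::vector<float> dropoutMask(size_t n, float p, unsigned seed) {
  std::mt19937 gen(seed);
  std::bernoulli_distribution keep(1.0 - p);
  std::vector<float> mask(n);
  for(auto& m : mask)
    m = keep(gen) ? 1.f / (1.f - p) : 0.f; // so E[mask[i] * x] == x
  return mask;
}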
debug(mask, "mask"); + g->forward(); } - cudaDeviceSynchronize(); - - std::cerr << "DropoutGenerator: " << rep << " repetitions: " << timer.format(5, "%ws") << std::endl; - - Tensor cudnnInTensor, cudnnOutTensor; - params->allocate(cudnnInTensor, {rows, cols, layers}); - params->allocate(cudnnOutTensor, {rows, cols, layers}); - - void* states_; - void* space_; - size_t spaceSize_; - cudnnDropoutDescriptor_t dropDesc_; - - CudnnDropoutPrepare(cudnnInTensor, prob, &dropDesc_, &space_, &spaceSize_, &states_, (size_t)1234); - cudaStreamSynchronize(0); - - cudaDeviceSynchronize(); - timer.start(); - for (int i = 0; i < rep; ++i) { - CudnnDropoutForward(dropDesc_, space_, spaceSize_, cudnnInTensor, cudnnOutTensor); - } - - cudaDeviceSynchronize(); - std::cerr << "CUDNN Dropout: " << rep << " repetitions: " << timer.format(5, "%ws") << std::endl; - - return 0; } diff --git a/src/test/marian_test.cu b/src/test/marian_test.cu index 529cfdb2..0d4fa378 100644 --- a/src/test/marian_test.cu +++ b/src/test/marian_test.cu @@ -7,38 +7,42 @@ #include #include "marian.h" +#include "training/config.h" #include "optimizers/optimizers.h" #include "optimizers/clippers.h" #include "data/batch_generator.h" #include "data/corpus.h" -#include "models/nematus.h" +#include "models/gnmt.h" int main(int argc, char** argv) { using namespace marian; using namespace data; + auto options = New(argc, argv, false); + std::vector files = - {"../test/mini.de", - "../test/mini.en"}; + {"../testln/mini.en", + "../testln/mini.de"}; std::vector vocab = - {"../test/vocab.de.json", - "../test/vocab.en.json"}; + {"../benchmark/marian32K/train.tok.true.bpe.en.json", + "../benchmark/marian32K/train.tok.true.bpe.de.json"}; - std::vector maxVocab = { 50000, 50000 }; + YAML::Node& c = options->get(); + c["train-sets"] = files; + c["vocabs"] = vocab; - auto corpus = DataSet(files, vocab, maxVocab, 50); - BatchGenerator bg(corpus, 10, 20); + auto corpus = DataSet(options); + BatchGenerator bg(corpus, options); auto graph = New(); - graph->setDevice(std::atoi(argv[1])); + graph->setDevice(1); - auto nematus = New(); - nematus->load(graph, "../test/model.npz"); + auto encdec = New(options); + encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz"); graph->reserveWorkspaceMB(128); - float sum = 0; boost::timer::cpu_timer timer; size_t batches = 1; for(int i = 0; i < 1; ++i) { @@ -47,39 +51,15 @@ int main(int argc, char** argv) { auto batch = bg.next(); batch->debug(); - auto costNode = nematus->build(graph, batch); - for(auto p : graph->params()) - debug(p, p->name()); + auto costNode = encdec->build(graph, batch); + //for(auto p : graph->params()) + //debug(p, p->name()); debug(costNode, "cost"); - graph->graphviz("debug.dot"); + //graph->graphviz("debug.dot"); graph->forward(); - graph->backward(); - - float cost = costNode->val()->scalar(); - sum += cost; - - if(batches % 100 == 0) { - std::cout << std::setfill(' ') - << "Epoch " << i - << " Update " << batches - << " Cost " << std::setw(7) << std::setprecision(6) << cost - << " UD " << timer.format(2, "%ws"); - - float seconds = std::stof(timer.format(5, "%w")); - float sentences = 100 * batch->size() / seconds; - - std::cout << " " << std::setw(5) - << std::setprecision(4) - << sentences - << " sentences/s" << std::endl; - timer.start(); - } - - - if(batches % 10000 == 0) - nematus->save(graph, "../test/model.marian." 
+ std::to_string(batches) + ".npz"); + //graph->backward(); batches++; } diff --git a/src/test/marian_translate.cu b/src/test/marian_translate.cu new file mode 100644 index 00000000..63c40878 --- /dev/null +++ b/src/test/marian_translate.cu @@ -0,0 +1,247 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "marian.h" +#include "training/config.h" +#include "optimizers/optimizers.h" +#include "optimizers/clippers.h" +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "models/gnmt.h" +#include "translator/nth_element.h" +#include "common/history.h" + + +namespace marian { + +template +class BeamSearch { + private: + Ptr builder_; + size_t beamSize_; + cudaStream_t stream_{0}; + + public: + BeamSearch(Ptr builder) + : builder_(builder), + beamSize_(12) + {} + + Beam toHyps(const std::vector keys, + const std::vector costs, + size_t vocabSize, + const Beam& beam) { + Beam newBeam; + for(int i = 0; i < keys.size(); ++i) { + int embIdx = keys[i] % vocabSize; + int hypIdx = keys[i] / vocabSize; + float cost = costs[i]; + + newBeam.push_back( + New(beam[hypIdx], embIdx, hypIdx, cost)); + } + return newBeam; + } + + Beam pruneBeam(const Beam& beam) { + Beam newBeam; + for(auto hyp : beam) { + if(hyp->GetWord() > 0) { + newBeam.push_back(hyp); + } + } + return newBeam; + } + + std::tuple, Expr> + step(std::vector hyps, + Expr srcContext, + Expr srcMask, + const std::vector hypIdx = {}, + const std::vector embIdx = {}) { + using namespace keywords; + auto graph = hyps[0]->graph(); + + // @TODO: not hard-coded! + int dimTrgEmb_ = 512; + int dimTrgVoc_ = 50000; + + std::vector selectedHyps; + Expr selectedEmbs; + if(embIdx.empty()) { + selectedHyps = hyps; + selectedEmbs = graph->constant(shape={1, dimTrgEmb_}, + init=inits::zeros); + } + else { + // @TODO : solve this better than reshaping! 
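// A small self-contained illustration of the index bookkeeping behind
// toHyps() above and the row selection just below: the n-best search runs
// over the flattened beamSize x vocabSize score matrix, so each returned
// key packs (hypothesis, word) as hypIdx * vocabSize + embIdx and is
// unpacked with one division and one modulo. decodeKey is illustrative
// only, not Marian API.
void decodeKey(unsigned key, unsigned vocabSize,
               unsigned& hypIdx, unsigned& embIdx) {
  embIdx = key % vocabSize; // word that extends the hypothesis
  hypIdx = key / vocabSize; // which beam entry it extends
}
// e.g. with vocabSize = 50000, key 150007 unpacks to hypIdx 3, embIdx 7.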
+ for(auto h : hyps) + selectedHyps.push_back( + reshape(rows(h, hypIdx), {1, h->shape()[1], 1, (int)hypIdx.size()})); + + auto yEmb = Embedding("Wemb_dec", dimTrgVoc_, dimTrgEmb_)(graph); + selectedEmbs = reshape(rows(yEmb, embIdx), + {1, yEmb->shape()[1], 1, (int)embIdx.size()}); + } + + Expr logits; + std::vector newHyps; + std::tie(logits, newHyps) = builder_->step(selectedEmbs, + selectedHyps, + srcContext, + srcMask, + true); + return std::make_tuple(newHyps, logsoftmax(logits)); + } + + std::tuple, Expr> + step(std::vector hyps, + Expr srcContext, + Expr srcMask, + const Beam& beam) { + + std::vector hypIndeces; + std::vector embIndeces; + std::vector beamCosts; + + for(auto hyp : beam) { + hypIndeces.push_back(hyp->GetPrevStateIndex()); + embIndeces.push_back(hyp->GetWord()); + beamCosts.push_back(hyp->GetCost()); + } + + auto graph = hyps[0]->graph(); + auto costs = graph->constant(keywords::shape={1, 1, 1, (int)beamCosts.size()}, + keywords::init=inits::from_vector(beamCosts)); + + std::vector newHyps; + Expr probs; + std::tie(newHyps, probs) = step(hyps, + srcContext, + srcMask, + hypIndeces, + embIndeces); + probs = probs + costs; + return std::make_tuple(newHyps, probs); + } + + Ptr search(Ptr graph, + Ptr batch) { + + std::vector startStates; + Expr srcContext, srcMask; + std::tie(startStates, srcContext, srcMask) + = builder_->buildEncoder(graph, batch); + + size_t pos = 0; + auto history = New(0); + Beam beam(1, New()); + bool first = true; + bool final = false; + std::vector beamSizes(1, beamSize_); + auto nth = New(beamSize_, batch->size(), stream_); + + history->Add(beam); + + std::vector hyps; + Expr probs; + do { + + if(first) { + std::tie(hyps, probs) = step(startStates, + srcContext, + srcMask); + pos = graph->forward(); + } + else { + std::tie(hyps, probs) = step(hyps, + srcContext, + srcMask, + beam); + beamSizes[0] = beam.size(); + pos = graph->forward(pos); + } + + size_t dimTrgVoc = probs->shape()[1]; + + std::vector outKeys; + std::vector outCosts; + + for(int i = 0; i < probs->shape()[3]; i++) { + probs->val()->set(i * dimTrgVoc + 1, std::numeric_limits::lowest()); + } + + nth->getNBestList(beamSizes, probs->val(), + outCosts, outKeys, first); + first = false; + + beam = toHyps(outKeys, outCosts, dimTrgVoc, beam); + final = history->size() >= 3 * batch->words(); + history->Add(beam, final); + beam = pruneBeam(beam); + + } while(!beam.empty() && !final); + + return history; + } +}; + +} + +int main(int argc, char** argv) { + using namespace marian; + using namespace data; + + auto options = New(argc, argv, false); + + std::vector files = + {"../benchmark/marian32K/newstest2016.tok.true.bpe.en"}; + //{"../benchmark/marian32K/test.txt"}; + + std::vector vocab = + {"../benchmark/marian32K/train.tok.true.bpe.en.json"}; + + YAML::Node& c = options->get(); + c["train-sets"] = files; + c["vocabs"] = vocab; + + auto corpus = DataSet(options); + BatchGenerator bg(corpus, options); + + auto graph = New(); + graph->setDevice(1); + + auto target = New(); + target->load("../benchmark/marian32K/train.tok.true.bpe.de.json", 50000); + + auto encdec = New(options); + encdec->load(graph, "../benchmark/marian32K/modelML6.200000.npz"); + + graph->reserveWorkspaceMB(128); + + boost::timer::cpu_timer timer; + bg.prepare(false); + while(bg) { + auto batch = bg.next(); + auto search = New>(encdec); + auto history = search->search(graph, batch); + + auto results = history->NBest(1); + for(auto r : results) { + for(auto w : r.first) + if(w != 0) + std::cout << (*target)[w] << " "; + 
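// A plain-C++ sketch of the printing step here: r.first holds the word ids
// of one n-best translation, and ids equal to 0 are skipped, which suggests
// (an assumption, not stated in this patch) that 0 is the reserved
// end-of-sentence/padding id. idsToText is illustrative only.
#include <cstddef>
#include <string>
#include <vector>

std::string idsToText(const std::vector<size_t>& ids,
                      const std::vector<std::string>& vocab) {
  std::string out;
  for(auto w : ids) {
    if(w == 0)                // mirrors the `w != 0` check above
      continue;
    if(!out.empty())
      out += ' ';
    out += w < vocab.size() ? vocab[w] : "<unk>";
  }
  return out;
}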
//std::cout << r.second->GetCost() << std::endl; + std::cout << std::endl; + } + } + std::cerr << timer.format(5, "%ws") << std::endl; + + return 0; + +} diff --git a/src/test/tensor_test.cu b/src/test/tensor_test.cu index 5ad967a3..a7851bfa 100644 --- a/src/test/tensor_test.cu +++ b/src/test/tensor_test.cu @@ -1,123 +1,86 @@ #include #include -//#include "tensors/tensor_allocator.h" -//#include "tensors/tensor_gpu.h" -//#include "kernels/tensor_operators.h" -//#include "kernels/thrust_functions.h" - -#include "data/corpus.h" -#include "data/batch_generator.h" +#include "tensors/tensor_allocator.h" +#include "tensors/tensor.h" +#include "kernels/tensor_operators.h" +#include "kernels/thrust_functions.h" +#include "common/logging.h" using namespace marian; - - int main() { + Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v")}; - std::vector files = - {"../benchmark/train.tok.true.en", - "../benchmark/train.tok.true.en", - "../benchmark/train.tok.true.de"}; + Ptr params = New(0); - std::vector vocab = - {"../benchmark/train.tok.true.en.json", - "../benchmark/train.tok.true.en.json", - "../benchmark/train.tok.true.de.json"}; + cublasHandle_t handle = create_handle(0); - std::vector maxVocab = { 50000, 50000, 50000 }; + int words = 64; + int batch = 128; + int hidden = 4096; - using namespace data; - auto corpus = New(files, vocab, maxVocab, 50); - BatchGenerator bg(corpus, 64, 20); + Tensor mappedState; + params->allocate(mappedState, {batch, hidden, 1}); + mappedState->set(0.001); - bg.prepare(); + Tensor mappedContext; + params->allocate(mappedContext, {batch, hidden, words}); + mappedContext->set(0.001); + + Tensor va; + params->allocate(va, {hidden, 1}); + va->set(0.001); + + Tensor out1; + params->allocate(out1, {batch, hidden, words}); + out1->set(0); + + Tensor gMappedState; + params->allocate(gMappedState, {batch, hidden, 1}); + gMappedState->set(0); + + Tensor gMappedContext; + params->allocate(gMappedContext, {batch, hidden, words}); + gMappedContext->set(0.001); + + Tensor gVa; + params->allocate(gVa, {hidden, 1}); + va->set(0.001); + + Tensor gOut1; + params->allocate(gOut1, {batch, hidden, words}); + out1->set(0); + + Tensor out2; + params->allocate(out2, {batch, 1, words}); + out2->set(0); + + boost::timer::cpu_timer timer; + for(int i = 0; i < 5000; ++i) { + Element(_1 = Tanh(_2 + _3), out1, mappedState, mappedContext); + Prod(handle, out2, out1, va, false, false, 0); + Prod(handle, gOut1, out2, va, false, true, 1.0f); + Prod(handle, gVa, out1, out2, true, false, 1.0f); + Add(_1 * (1.f - (_2 *_2)), gMappedState, out1, out1); + Add(_1 * (1.f - (_2 *_2)), gMappedContext, out1, out1); + cudaStreamSynchronize(0); - size_t i = 0; - size_t samples = 0; - while(bg) { - auto batch = bg.next(); - if(i && i % 10000 == 0) - std::cerr << "[" << i << "/" << samples << "]" << std::endl; if(i % 100 == 0) - std::cerr << "."; - i++; - - samples += batch->size(); + std::cout << "." 
<< std::flush; } + std::cout << timer.format(5, "%ws") << std::endl; - - - - //TensorAllocator params = newTensorAllocator(); - // - //cublasHandle_t handle = create_handle(); - // - //int words = 64; - //int batch = 128; - //int hidden = 4096; - // - //Tensor mappedState; - //params->allocate(mappedState, {batch, hidden, 1}); - //mappedState->set(0.001); - // - //Tensor mappedContext; - //params->allocate(mappedContext, {batch, hidden, words}); - //mappedContext->set(0.001); - // - //Tensor va; - //params->allocate(va, {hidden, 1}); - //va->set(0.001); - // - //Tensor out1; - //params->allocate(out1, {batch, hidden, words}); - //out1->set(0); - // - //Tensor gMappedState; - //params->allocate(gMappedState, {batch, hidden, 1}); - //gMappedState->set(0); - // - //Tensor gMappedContext; - //params->allocate(gMappedContext, {batch, hidden, words}); - //gMappedContext->set(0.001); - // - //Tensor gVa; - //params->allocate(gVa, {hidden, 1}); - //va->set(0.001); - // - //Tensor gOut1; - //params->allocate(gOut1, {batch, hidden, words}); - //out1->set(0); - // - //Tensor out2; - //params->allocate(out2, {batch, 1, words}); - //out2->set(0); - // - //boost::timer::cpu_timer timer; - //for(int i = 0; i < 5000; ++i) { - // Element(_1 = Tanh(_2 + _3), out1, mappedState, mappedContext); - // Prod(handle, out2, out1, va, false, false, 0); - // Prod(handle, gOut1, out2, va, false, true, 1.0f); - // Prod(handle, gVa, out1, out2, true, false, 1.0f); - // Add(_1 * (1.f - (_2 *_2)), gMappedState, out1, out1); - // Add(_1 * (1.f - (_2 *_2)), gMappedContext, out1, out1); - // cudaStreamSynchronize(0); - // - // if(i % 100 == 0) - // std::cout << "." << std::flush; - //} - //std::cout << timer.format(5, "%ws") << std::endl; - // - //boost::timer::cpu_timer timer2; - //for(int i = 0; i < 5000; ++i) { - // Att(out2, mappedContext, mappedState, va); - // AttBack(gMappedContext, gMappedState, gVa, - // mappedContext, mappedState, va, out2); - // cudaStreamSynchronize(0); - // if(i % 100 == 0) - // std::cout << "." << std::flush; - //} - //std::cout << timer2.format(5, "%ws") << std::endl; + boost::timer::cpu_timer timer2; + for(int i = 0; i < 5000; ++i) { + Att(va, out2, mappedContext, mappedState, nullptr); + AttBack(gVa, gMappedContext, gMappedState, nullptr, + va, mappedContext, mappedState, out2, nullptr); + cudaStreamSynchronize(0); + if(i % 100 == 0) + std::cout << "." 
<< std::flush; + } + std::cout << timer2.format(5, "%ws") << std::endl; return 0; } diff --git a/src/command/config.cpp b/src/training/config.cpp similarity index 62% rename from src/command/config.cpp rename to src/training/config.cpp index d806f3e4..ecd58d7a 100644 --- a/src/command/config.cpp +++ b/src/training/config.cpp @@ -1,8 +1,10 @@ -#include "command/config.h" #include #include +#include +#include "training/config.h" #include "common/file_stream.h" +#include "common/logging.h" #define SET_OPTION(key, type) \ do { if(!vm_[key].defaulted() || !config_[key]) { \ @@ -14,6 +16,8 @@ do { if(vm_.count(key) > 0) { \ config_[key] = vm_[key].as(); \ }} while(0) +namespace marian { + bool Config::has(const std::string& key) const { return config_[key]; } @@ -26,6 +30,10 @@ const YAML::Node& Config::get() const { return config_; } +YAML::Node& Config::get() { + return config_; +} + void ProcessPaths(YAML::Node& node, const boost::filesystem::path& configPath, bool isPath) { using namespace boost::filesystem; std::set paths = {"model", "trainsets", "vocabs"}; @@ -68,24 +76,18 @@ void ProcessPaths(YAML::Node& node, const boost::filesystem::path& configPath, b } void Config::validate() const { - if (has("trainsets")) { - std::vector tmp = get>("trainsets"); - if (tmp.size() != 2) { - std::cerr << "No trainsets!" << std::endl; - exit(1); - } - } else { - std::cerr << "No trainsets!" << std::endl; - exit(1); + UTIL_THROW_IF2(!has("train-sets") + || get>("train-sets").empty(), + "No train sets given in config file or on command line"); + if(has("vocabs")) { + UTIL_THROW_IF2(get>("vocabs").size() != + get>("train-sets").size(), + "There should be as many vocabularies as training sets"); } - if (has("vocabs")) { - if (get>("vocabs").size() != 2) { - std::cerr << "No vocab files!" << std::endl; - exit(1); - } - } else { - std::cerr << "No vocab files!" << std::endl; - exit(1); + if(has("valid-sets")) { + UTIL_THROW_IF2(get>("valid-sets").size() != + get>("train-sets").size(), + "There should be as many validation sets as training sets"); } } @@ -122,7 +124,7 @@ void OutputRec(const YAML::Node node, YAML::Emitter& out) { } } -void Config::addOptions(int argc, char** argv) { +void Config::addOptions(int argc, char** argv, bool doValidate) { std::string configPath; namespace po = boost::program_options; @@ -133,50 +135,88 @@ void Config::addOptions(int argc, char** argv) { "Configuration file") ("model,m", po::value()->default_value("./model"), "Path prefix for model to be saved") - ("device,d", po::value>() - ->multitoken() - ->default_value(std::vector({0}), "0"), - "Use device(s) no. arg") ("init,i", po::value(), "Load weights from arg before training") ("overwrite", po::value()->default_value(false), "Overwrite model with following checkpoints") - ("trainsets,t", po::value>()->multitoken(), + ("train-sets,t", po::value>()->multitoken(), "Paths to training corpora: source target") ("vocabs,v", po::value>()->multitoken(), - "Paths to vocabulary files, have to correspond to --trainsets") + "Paths to vocabulary files have to correspond to --trainsets. " + "If this parameter is not supplied we look for vocabulary files " + "source.{yml,json} and target.{yml,json}. 
" + "If these files do not exists they are created.") + ("max-length", po::value()->default_value(50), + "Maximum length of a sentence in a training sentence pair") ("after-epochs,e", po::value()->default_value(0), "Finish after this many epochs, 0 is infinity") ("after-batches", po::value()->default_value(0), "Finish after this many batch updates, 0 is infinity") - ("disp-freq", po::value()->default_value(100), + ("disp-freq", po::value()->default_value(1000), "Display information every arg updates") - ("save-freq", po::value()->default_value(30000), + ("save-freq", po::value()->default_value(10000), "Save model file every arg updates") + ("no-shuffle", po::value()->zero_tokens()->default_value(false), + "Skip shuffling of training data before each epoch") ("workspace,w", po::value()->default_value(2048), "Preallocate arg MB of work space") + ("log", po::value(), + "Log training process information to file given by arg") ; - po::options_description hyper("Search options"); - hyper.add_options() - ("max-length", po::value()->default_value(50), - "Maximum length of a sentence in a training sentence pair") - ("mini-batch,b", po::value()->default_value(40), - "Size of mini-batch used during update") - ("maxi-batch", po::value()->default_value(20), - "Number of batches to preload for length-based sorting") - ("lrate,l", po::value()->default_value(0.0002), - "Learning rate for Adam algorithm") - ("clip-norm", po::value()->default_value(1.f), - "Clip gradient norm to arg (0 to disable)") + po::options_description valid("Validation set options"); + valid.add_options() + ("valid-sets", po::value>()->multitoken(), + "Paths to validation corpora: source target") + ("valid-freq", po::value()->default_value(10000), + "Validate model every arg updates") + ("valid-metrics", po::value>() + ->multitoken() + ->default_value(std::vector({"cross-entropy"}), + "cross-entropy"), + "Metric to use during validation: cross-entropy, perplexity. 
" + "Multiple metrics can be specified") + ("early-stopping", po::value()->default_value(10), + "Stop if the first validation metric does not improve for arg consecutive " + "validation steps") + ("valid-log", po::value(), + "Log validation scores to file given by arg") + ; + + po::options_description model("Model options"); + model.add_options() ("dim-vocabs", po::value>() ->multitoken() ->default_value(std::vector({50000, 50000}), "50000 50000"), "Maximum items in vocabulary ordered by rank") ("dim-emb", po::value()->default_value(512), "Size of embedding vector") ("dim-rnn", po::value()->default_value(1024), "Size of rnn hidden state") - ("no-shuffle", po::value()->zero_tokens()->default_value(false), - "Skip shuffling of training data before each epoch") + ("layers-enc", po::value()->default_value(8), "Number of encoder layers") + ("layers-dec", po::value()->default_value(8), "Number of decoder layers") + ("skip", po::value()->zero_tokens()->default_value(false), + "Use skip connections") + ("normalize", po::value()->zero_tokens()->default_value(false), + "Enable layer normalization") + ("dropout-rnn", po::value()->default_value(0), + "Scaling dropout along rnn layers and time (0 = no dropout)") + ; + + po::options_description opt("Optimizer options"); + opt.add_options() + ("mini-batch,b", po::value()->default_value(64), + "Size of mini-batch used during update") + ("maxi-batch", po::value()->default_value(100), + "Number of batches to preload for length-based sorting") + ("optimizer,o", po::value()->default_value("adam"), + "Optimization algorithm (possible values: sgd, adagrad, adam") + ("learn-rate,l", po::value()->default_value(0.0001), + "Learning rate") + ("clip-norm", po::value()->default_value(1.f), + "Clip gradient norm to arg (0 to disable)") + ("device,d", po::value>() + ->multitoken() + ->default_value(std::vector({0}), "0"), + "GPUs to use for training. 
Asynchronous SGD is used with multiple devices.") ; po::options_description configuration("Configuration meta options"); @@ -191,7 +231,9 @@ void Config::addOptions(int argc, char** argv) { po::options_description cmdline_options("Allowed options"); cmdline_options.add(general); - cmdline_options.add(hyper); + cmdline_options.add(valid); + cmdline_options.add(model); + cmdline_options.add(opt); cmdline_options.add(configuration); boost::program_options::variables_map vm_; @@ -223,14 +265,25 @@ void Config::addOptions(int argc, char** argv) { SET_OPTION("device", std::vector); SET_OPTION_NONDEFAULT("init", std::string); SET_OPTION("overwrite", bool); + SET_OPTION_NONDEFAULT("log", std::string); // SET_OPTION_NONDEFAULT("trainsets", std::vector); - if (!vm_["trainsets"].empty()) { - config_["trainsets"] = vm_["trainsets"].as>(); + if (!vm_["train-sets"].empty()) { + config_["train-sets"] = vm_["train-sets"].as>(); + } + if (!vm_["valid-sets"].empty()) { + config_["valid-sets"] = vm_["valid-sets"].as>(); } if (!vm_["vocabs"].empty()) { config_["vocabs"] = vm_["vocabs"].as>(); } + + SET_OPTION_NONDEFAULT("valid-sets", std::vector); + SET_OPTION("valid-freq", size_t); + SET_OPTION("valid-metrics", std::vector); + SET_OPTION("early-stopping", size_t); + SET_OPTION_NONDEFAULT("valid-log", std::string); + // SET_OPTION_NONDEFAULT("vocabs", std::vector); SET_OPTION("after-epochs", size_t); SET_OPTION("after-batches", size_t); @@ -242,14 +295,22 @@ void Config::addOptions(int argc, char** argv) { SET_OPTION("max-length", size_t); SET_OPTION("mini-batch", int); SET_OPTION("maxi-batch", int); - SET_OPTION("lrate", double); + SET_OPTION("optimizer", std::string); + SET_OPTION("learn-rate", double); SET_OPTION("clip-norm", double); SET_OPTION("dim-vocabs", std::vector); + + SET_OPTION("layers-enc", int); + SET_OPTION("layers-dec", int); SET_OPTION("dim-emb", int); SET_OPTION("dim-rnn", int); SET_OPTION("no-shuffle", bool); - - validate(); + SET_OPTION("normalize", bool); + SET_OPTION("dropout-rnn", float); + SET_OPTION("skip", bool); + + if(doValidate) + validate(); if (get("relative-paths") && !vm_["dump-config"].as()) ProcessPaths(config_, boost::filesystem::path{configPath}.parent_path(), false); @@ -263,9 +324,17 @@ void Config::addOptions(int argc, char** argv) { } -void Config::logOptions() { - std::stringstream ss; +void Config::log() { + createLoggers(*this); + YAML::Emitter out; OutputRec(config_, out); - std::cerr << "Options: \n" << out.c_str() << std::endl; + std::string conf = out.c_str(); + + std::vector results; + boost::algorithm::split(results, conf, boost::is_any_of("\n")); + for(auto &r : results) + LOG(config) << r; +} + } diff --git a/src/command/config.h b/src/training/config.h similarity index 68% rename from src/command/config.h rename to src/training/config.h index 22220a19..8ce0cf58 100644 --- a/src/command/config.h +++ b/src/training/config.h @@ -1,12 +1,17 @@ #pragma once -#include #include +#include "3rd_party/yaml-cpp/yaml.h" +#include "common/logging.h" + +namespace marian { + class Config { public: - Config(int argc, char** argv) { - addOptions(argc, argv); + Config(int argc, char** argv, bool validate = true) { + addOptions(argc, argv, validate); + log(); } bool has(const std::string& key) const; @@ -19,12 +24,14 @@ class Config { } const YAML::Node& get() const; + YAML::Node& get(); + YAML::Node operator[](const std::string& key) const { return get(key); } - void addOptions(int argc, char** argv); - void logOptions(); + void addOptions(int argc, char** argv, bool 
validate); + void log(); void validate() const; template @@ -37,3 +44,5 @@ class Config { std::string inputPath; YAML::Node config_; }; + +} diff --git a/src/parallel/graph_group.h b/src/training/graph_group.h similarity index 53% rename from src/parallel/graph_group.h rename to src/training/graph_group.h index b3f76e4b..7fd8af59 100644 --- a/src/parallel/graph_group.h +++ b/src/training/graph_group.h @@ -1,242 +1,267 @@ #pragma once #include +#include #include "common/definitions.h" #include "3rd_party/threadpool.h" +#include "optimizers/optimizers.h" +#include "training/training.h" +#include "training/validator.h" namespace marian { - -class Reporter { - public: - Ptr options_; - - float costSum{0}; - size_t epochs{1}; - - size_t samples{0}; - size_t wordsDisp{0}; - size_t batches{0}; - - boost::timer::cpu_timer timer; - - public: - Reporter(Ptr options) : options_(options) {} - - void update(float cost, Ptr batch) { - static std::mutex sMutex; - std::lock_guard guard(sMutex); - - costSum += cost; - samples += batch->size(); - wordsDisp += batch->words(); - batches++; - //if(options.get("after-batches") - // && batches >= options.get("after-batches")) - // break; - - if(batches % options_->get("disp-freq") == 0) { - std::stringstream ss; - ss << "Ep. " << epochs - << " : Up. " << batches - << " : Sen. " << samples - << " : Cost " << std::fixed << std::setprecision(2) - << costSum / options_->get("disp-freq") - << " : Time " << timer.format(2, "%ws"); - - float seconds = std::stof(timer.format(5, "%w")); - float wps = wordsDisp / (float)seconds; - - ss << " : " << std::fixed << std::setprecision(2) - << wps << " words/s"; - - LOG(info) << ss.str(); - - timer.start(); - costSum = 0; - wordsDisp = 0; - } - } -}; class GraphGroup { protected: Ptr options_; Ptr reporter_; Ptr opt_; - + std::vector> graphs_; - + public: GraphGroup(Ptr options) - : options_(options) { - - Ptr clipper = nullptr; - float clipNorm = options_->get("clip-norm"); - float lrate = options_->get("lrate"); - if(clipNorm > 0) - clipper = Clipper(clipNorm); - - opt_ = Optimizer(lrate, - keywords::clip=clipper); - } - + : options_(options), opt_(Optimizer(options)) { } + virtual void update(Ptr) = 0; - + virtual void setReporter(Ptr reporter) { - reporter_ = reporter; + reporter_ = reporter; } - + + virtual void load() = 0; + virtual void save() = 0; }; template -class AsynchronousGraphGroup : public GraphGroup { +class AsyncGraphGroup : public GraphGroup { private: - Ptr builder_; - + std::vector> builders_; + std::vector devices_; - ThreadPool pool_; - + std::vector> graphs_; - + std::mutex sync_; - - Tensor params_; - Ptr paramsAlloc_; - - Tensor grads_; - Ptr gradsAlloc_; - + std::vector shardSync_; + + std::vector params_; + std::vector > paramsAlloc_; + + std::vector grads_; + std::vector> gradsAlloc_; + + std::vector> shardOpt_; + + int shardSize_; + + ThreadPool pool_; + void fetchParams(Tensor oldParams) { if(graphs_.size() < 2) return; - + // @TODO read guard on parameters - std::lock_guard guard(sync_); - oldParams->copyFrom(params_); + int pos = 0; + + std::vector threads; + for (int idx = 0; idx < devices_.size(); idx++) { + threads.emplace_back( std::thread( [=](int idx, int pos) { + //individual mutex per-shard + std::lock_guard guard( shardSync_[idx] ); + oldParams->subtensor(pos , params_[idx]->size())->copyFrom(params_[idx]); + }, idx, pos) ); + + pos += shardSize_; + } + for (auto &&t : threads) { + t.join(); + } } - + void pushGradients(Tensor newGrads) { if(graphs_.size() < 2) { 
opt_->update(graphs_[0]); } else { - std::lock_guard guard(sync_); - grads_->copyFrom(newGrads); - opt_->update(params_, grads_); + // add instead of copy? + std::vector threads; + int pos = 0; + for (int idx = 0; idx < devices_.size(); idx++) { + threads.emplace_back( std::thread([=](int idx, int pos) { + //individual mutex per-shard + std::lock_guard guard( shardSync_[idx] ); + grads_[idx]->copyFrom( newGrads->subtensor(pos , grads_[idx]->size() ) ); + shardOpt_[idx]->update(params_[idx], grads_[idx]); + + cudaStreamSynchronize(0); + } , idx, pos) ); + + pos += shardSize_; + } + for(auto&& t : threads) + t.join(); } } - + void execute(Ptr batch) { static bool first = true; if(first && graphs_.size() > 1) { - // initialize the paramters - for(auto graph : graphs_) { - builder_->build(graph, batch); - graph->forward(); + // initialize the parameters + for(size_t i = 0; i < graphs_.size(); ++i) { + builders_[i]->build(graphs_[i], batch); + graphs_[i]->forward(); } - - if(!params_) { - paramsAlloc_ = New(graphs_[0]->getDevice()); - + + if(params_.size() == 0) { int totalSize = graphs_[0]->params().vals()->size(); - paramsAlloc_->reserveExact(totalSize); - paramsAlloc_->allocate(params_, {1, totalSize}); + shardSize_ = ceil(totalSize / devices_.size()); + + int pos = 0; + //parameter sharding + for (auto device : devices_){ + int __size__ = min(shardSize_, totalSize); + totalSize -= __size__; + Tensor param_; + Ptr allocator_ = New(device); + + allocator_->reserveExact(__size__); + allocator_->allocate(param_, {1, __size__}); + paramsAlloc_.push_back(allocator_); + param_->copyFrom( graphs_[0]->params().vals()->subtensor( pos , __size__ ) ); + params_.push_back(param_); + pos += __size__; + + } } - - if(!grads_) { - gradsAlloc_ = New(graphs_[0]->getDevice()); - + if(grads_.size() == 0) { int totalSize = graphs_[0]->params().vals()->size(); - gradsAlloc_->reserveExact(totalSize); - gradsAlloc_->allocate(grads_, {1, totalSize}); + + for (auto device : devices_){ + int __size__ = min(shardSize_, totalSize); + totalSize -= __size__; + Tensor grad_; + Ptr allocator_ = New(device); + + allocator_->reserveExact(__size__); + allocator_->allocate(grad_, {1, __size__}); + gradsAlloc_.push_back(allocator_); + grads_.push_back(grad_); + + } } - - params_->copyFrom(graphs_[0]->params().vals()); + first = false; } - + auto task = [this](Ptr batch) { static size_t i = 0; thread_local Ptr graph; + thread_local Ptr builder; + thread_local size_t t = 0; + if(!graph) { std::lock_guard lock(sync_); - graph = graphs_[i++]; + graph = graphs_[i]; + builder = builders_[i++]; } - - builder_->build(graph, batch); - + + builder->build(graph, batch); fetchParams(graph->params().vals()); - + graph->forward(); float cost = graph->topNode()->scalar(); graph->backward(); - + + cudaStreamSynchronize(0); pushGradients(graph->params().grads()); - + if(reporter_) { + std::lock_guard guard(sync_); reporter_->update(cost, batch); if(reporter_->batches % options_->get("save-freq") == 0) this->save(); + size_t prevStalled = reporter_->stalled(); + reporter_->validate(graph); + if(prevStalled < reporter_->stalled()) + for(auto opt : shardOpt_) + opt->updateSchedule(); } + + t++; }; - + pool_.enqueue(task, batch); - } - - public: - AsynchronousGraphGroup(Ptr options) - : GraphGroup(options), - builder_{New(options_)}, - devices_{options_->get>("device")}, - pool_{devices_.size(), devices_.size() } { - - for(auto device : devices_) { - graphs_.emplace_back(New()); - graphs_.back()->setDevice(device); - 
graphs_.back()->reserveWorkspaceMB(options_->get("workspace")); + } + + void load() { + if(options_->has("init")) { + std::string init = options_->get("init"); + size_t i = 0; + for(auto graph : graphs_) + builders_[i++]->load(graph, init); } } - + + public: + typedef Builder builder_type; + + AsyncGraphGroup(Ptr options) + : GraphGroup(options), + devices_{options_->get>("device")}, + pool_{devices_.size(), devices_.size()}, + shardSync_{devices_.size()} { + + for(auto device : devices_) { + auto graph = New(); + graph->setDevice(device); + graph->reserveWorkspaceMB(options_->get("workspace")); + graphs_.push_back(graph); + shardOpt_.push_back(Optimizer(options_)); + builders_.push_back(New(options_)); + } + + load(); + } + void update(Ptr batch) { execute(batch); } - + void save() { - std::lock_guard guard(sync_); if(options_->get("overwrite")) { std::string name = options_->get("model") + ".npz"; - builder_->save(graphs_[0], name); + builders_[0]->save(graphs_[0], name); } else { std::string name = options_->get("model") + "." + std::to_string(reporter_->batches) + ".npz"; - builder_->save(graphs_[0], name); + builders_[0]->save(graphs_[0], name); } } }; - + template -class SynchronousGraphGroup : public GraphGroup { +class SyncGraphGroup : public GraphGroup { private: Ptr builder_; std::vector> batches_; - + bool first_{true}; - + void accumulateGradients(Ptr master, std::vector> graphs) { if(graphs_.size() < 2) { return; } - + Tensor grads = master->params().grads(); Tensor tempGrads; master->tensor(tempGrads, grads->shape()); - + for(auto graph : graphs) { if(graph != master) { Tensor remoteGrads = graph->params().grads(); @@ -244,24 +269,24 @@ class SynchronousGraphGroup : public GraphGroup { Element(_1 += _2, grads, tempGrads); } } - + float denom = graphs_.size(); Element(_1 /= denom, grads); } - + void distributeParameters(Ptr master, std::vector> graphs) { if(graphs_.size() < 2) return; - - Tensor params = master->params().vals(); + + Tensor params = master->params().vals(); for(auto graph : graphs) { if(graph != master) { graph->params().vals()->copyFrom(params); } } } - + void execute() { if(first_) { for(auto graph : graphs_) { @@ -271,66 +296,77 @@ class SynchronousGraphGroup : public GraphGroup { distributeParameters(graphs_[0], graphs_); first_ = false; } - + auto task = [this](int i, Ptr batch) { thread_local int j = -1; if(j == -1) j = i; auto localGraph = this->graphs_[j]; - + builder_->build(localGraph, batch); localGraph->forward(); float cost = localGraph->topNode()->scalar(); localGraph->backward(); - + if(reporter_) { reporter_->update(cost, batch); if(reporter_->batches % options_->get("save-freq") == 0) this->save(); } }; - + { size_t workers = graphs_.size(); ThreadPool pool(workers, workers); - + for(int i = 0; i < batches_.size(); ++i) pool.enqueue(task, i % (int)workers, batches_[i]); - } + } accumulateGradients(graphs_[0], graphs_); opt_->update(graphs_[0]); distributeParameters(graphs_[0], graphs_); - + batches_.clear(); } - + + void load() { + if(options_->has("init")) { + std::string init = options_->get("init"); + for(auto graph : graphs_) + builder_->load(graph, init); + } + } + public: - SynchronousGraphGroup(Ptr options) + typedef Builder builder_type; + + SyncGraphGroup(Ptr options) : GraphGroup(options), builder_{New(options_)} { - + auto devices = options_->get>("device"); size_t workers = devices.size(); - + for(auto device : devices) { graphs_.emplace_back(New()); graphs_.back()->setDevice(device); 
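// A reduced sketch of the sharded asynchronous update implemented by
// AsyncGraphGroup above (contrast with the synchronous accumulate-then-
// broadcast scheme of SyncGraphGroup being set up here): the parameter
// vector is cut into one shard per device, with shard size roughly
// ceil(total / devices) as above, and each shard owns its own mutex and
// optimizer, so a worker only locks the slice it is reading or writing.
// The Shard struct and sgdPush helper are illustrative stand-ins for
// params_/shardSync_/shardOpt_, with plain SGD in place of the
// configurable optimizer.
#include <cstddef>
#include <mutex>
#include <vector>

struct Shard {
  std::mutex guard;          // per-shard lock, like shardSync_[idx]
  std::vector<float> params; // this device's slice of the parameters
};

void sgdPush(Shard& shard, const std::vector<float>& gradSlice, float eta) {
  std::lock_guard<std::mutex> lock(shard.guard); // lock only this shard
  for(size_t i = 0; i < shard.params.size(); ++i)
    shard.params[i] -= eta * gradSlice[i];
}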
graphs_.back()->reserveWorkspaceMB(options_->get("workspace")); } - + + load(); } - - ~SynchronousGraphGroup() { + + ~SyncGraphGroup() { execute(); } - + void update(Ptr batch) { batches_.push_back(batch); if(batches_.size() == graphs_.size()) execute(); } - + void save() { if(options_->get("overwrite")) { std::string name = options_->get("model") + ".npz"; @@ -342,6 +378,7 @@ class SynchronousGraphGroup : public GraphGroup { builder_->save(graphs_[0], name); } } + }; - -} \ No newline at end of file + +} diff --git a/src/training/training.h b/src/training/training.h new file mode 100644 index 00000000..10c27d41 --- /dev/null +++ b/src/training/training.h @@ -0,0 +1,152 @@ +#pragma once + +#include "data/batch_generator.h" +#include "data/corpus.h" +#include "training/config.h" +#include "training/validator.h" + +namespace marian { + +class Reporter { + public: + Ptr options_; + std::vector> validators_; + + float costSum{0}; + size_t epochs{1}; + + size_t samples{0}; + size_t wordsDisp{0}; + size_t batches{0}; + + boost::timer::cpu_timer timer; + + public: + Reporter(Ptr options) : options_(options) {} + + bool keepGoing() { + // stop if it reached the maximum number of epochs + if(options_->get("after-epochs") > 0 + && epochs > options_->get("after-epochs")) + return false; + + // stop if it reached the maximum number of batch updates + if(options_->get("after-batches") > 0 + && batches >= options_->get("after-batches")) + return false; + + // stop if the first validator did not improve for a given number of checks + if(options_->get("early-stopping") > 0 + && !validators_.empty() + && validators_[0]->stalled() >= options_->get("early-stopping")) + return false; + + return true; + } + + void increaseEpoch() { + LOG(info) << "Seen " << samples << " samples"; + + epochs++; + samples = 0; + + LOG(info) << "Starting epoch " << epochs; + } + + void finished() { + LOG(info) << "Training finshed"; + } + + void addValidator(Ptr validator) { + validators_.push_back(validator); + } + + void validate(Ptr graph) { + if(batches % options_->get("valid-freq") == 0) { + for(auto validator : validators_) { + if(validator) { + size_t stalledPrev = validator->stalled(); + float value = validator->validate(graph); + std::stringstream ss; + ss << batches << " : "; + ss << validator->type() << " : " << value; + if(validator->stalled() > 0) + ss << " : stalled " << validator->stalled() << " times"; + else + ss << " : new best"; + LOG(valid) << ss.str(); + } + } + } + } + + size_t stalled() { + for(auto validator : validators_) + if(validator) + return validator->stalled(); + return 0; + } + + void update(float cost, Ptr batch) { + costSum += cost; + samples += batch->size(); + wordsDisp += batch->words(); + batches++; + + if(batches % options_->get("disp-freq") == 0) { + std::stringstream ss; + ss << "Ep. " << epochs + << " : Up. " << batches + << " : Sen. 
" << samples + << " : Cost " << std::fixed << std::setprecision(2) + << costSum / options_->get("disp-freq") + << " : Time " << timer.format(2, "%ws"); + + float seconds = std::stof(timer.format(5, "%w")); + float wps = wordsDisp / (float)seconds; + + ss << " : " << std::fixed << std::setprecision(2) + << wps << " words/s"; + + LOG(info) << ss.str(); + + timer.start(); + costSum = 0; + wordsDisp = 0; + } + } +}; + +template +void Train(Ptr options) { + using namespace data; + using namespace keywords; + + auto trainCorpus = New(options); + auto batchGenerator = New>(trainCorpus, + options); + auto reporter = New(options); + + if(options->has("valid-sets") && options->get("valid-freq") > 0) { + for(auto validator : Validators(trainCorpus->getVocabs(), + options)) + reporter->addValidator(validator); + } + + auto model = New(options); + model->setReporter(reporter); + + while(reporter->keepGoing()) { + batchGenerator->prepare(!options->get("no-shuffle")); + while(*batchGenerator && reporter->keepGoing()) { + auto batch = batchGenerator->next(); + model->update(batch); + } + if(reporter->keepGoing()) + reporter->increaseEpoch(); + } + reporter->finished(); + model->save(); +} + +} diff --git a/src/training/validator.h b/src/training/validator.h new file mode 100644 index 00000000..7a58fa02 --- /dev/null +++ b/src/training/validator.h @@ -0,0 +1,146 @@ +#pragma once + + #include + +#include "training/config.h" +#include "graph/expression_graph.h" +#include "data/corpus.h" +#include "data/batch_generator.h" + +namespace marian { + + class Validator { + protected: + Ptr options_; + std::vector> vocabs_; + float lastBest_; + size_t stalled_{0}; + + public: + Validator(std::vector> vocabs, + Ptr options) + : options_(options), + vocabs_(vocabs), + lastBest_{lowerIsBetter() ? 
+ std::numeric_limits::max() : + std::numeric_limits::lowest() } { + } + + virtual std::string type() = 0; + + virtual bool lowerIsBetter() { + return true; + } + + size_t stalled() { + return stalled_; + } + + float validate(Ptr graph) { + using namespace data; + auto validPaths = options_->get>("valid-sets"); + auto corpus = New(validPaths, vocabs_, options_); + Ptr> batchGenerator + = New>(corpus, options_); + batchGenerator->prepare(false); + + float val = validate(graph, batchGenerator); + if(lowerIsBetter() && lastBest_ > val || + !lowerIsBetter() && lastBest_ < val) { + stalled_ = 0; + lastBest_ = val; + } + else { + stalled_++; + } + return val; + }; + + virtual float validate(Ptr, + Ptr>) = 0; + + }; + + template + class CrossEntropyValidator : public Validator { + private: + Ptr builder_; + + public: + CrossEntropyValidator(std::vector> vocabs, + Ptr options) + : Validator(vocabs, options), + builder_(New(options)) {} + + float validate(Ptr graph, + Ptr> batchGenerator) { + float cost = 0; + size_t samples = 0; + + while(*batchGenerator) { + auto batch = batchGenerator->next(); + builder_->build(graph, batch); + graph->forward(); + + cost += graph->topNode()->scalar() * batch->size(); + samples += batch->size(); + } + + return cost / samples; + } + + std::string type() { return "cross-entropy"; } + }; + + template + class PerplexityValidator : public Validator { + private: + Ptr builder_; + + public: + PerplexityValidator(std::vector> vocabs, + Ptr options) + : Validator(vocabs, options), + builder_(New(options)) {} + + float validate(Ptr graph, + Ptr> batchGenerator) { + float cost = 0; + size_t words = 0; + + while(*batchGenerator) { + auto batch = batchGenerator->next(); + builder_->build(graph, batch); + graph->forward(); + + cost += graph->topNode()->scalar() * batch->size(); + words += batch->words(); + } + + return expf(cost / words); + } + + std::string type() { return "perplexity"; } + + }; + + template + std::vector> Validators(std::vector> vocabs, + Ptr options) { + std::vector> validators; + + auto validMetrics = options->get>("valid-metrics"); + for(auto metric : validMetrics) { + if(metric == "cross-entropy") { + auto validator = New>(vocabs, options); + validators.push_back(validator); + } + if(metric == "perplexity") { + auto validator = New>(vocabs, options); + validators.push_back(validator); + } + } + return validators; + } + +} diff --git a/src/translator/nth_element.cu b/src/translator/nth_element.cu new file mode 100644 index 00000000..6d0ab77a --- /dev/null +++ b/src/translator/nth_element.cu @@ -0,0 +1,360 @@ +#include + +#include "translator/nth_element.h" + +namespace marian { + +void HandleError(cudaError_t err, const char *file, int line ) { + if (err != cudaSuccess) { + std::cerr << "ERROR: " << cudaGetErrorString(err) << " in " << file << " at line " << line << std::endl; + exit( EXIT_FAILURE ); + } +} + +#define UNROLL_MAXARG_LOOP( n, max ) \ + if (tid < (n) && tid + (n) < ( max ) ) { \ + if (sdata[tid + ( n ) ] > sdata[tid]) { \ + sdata[tid] = sdata[tid + ( n ) ]; \ + indices[tid] = indices[tid + ( n ) ]; \ + } \ + } + +#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) + +__global__ void gMaxElement(float* d_out, int* d_ind, float* d_in, int numBatches, int* batchFirstElementIdxs) { + extern __shared__ float sdata[]; + __shared__ int indices[512]; + + int tid = threadIdx.x; + + for (int batchIdx = 0; batchIdx < numBatches; ++batchIdx) { + int begin = batchFirstElementIdxs[batchIdx]; + int end = batchFirstElementIdxs[batchIdx + 
1]; + + int i = begin + blockIdx.x * (blockDim.x * 2) + tid; + + sdata[tid] = -3.40282e+38f; + + if (i < end) { + sdata[tid] = d_in[i]; + indices[tid] = i; + } + + if (i + blockDim.x < end) { + float a = d_in[i]; + float b = d_in[i + blockDim.x]; + if (a > b) { + sdata[tid] = a; + indices[tid] = i; + } else { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + + while (i + 2 * gridDim.x * blockDim.x < end) { + i += 2 * gridDim.x * blockDim.x; + + float a = d_in[i]; + if (a > sdata[tid]) { + sdata[tid] = a; + indices[tid] = i; + } + + if (i + blockDim.x < end) { + float b = d_in[i + blockDim.x]; + if (b > sdata[tid]) { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + } + + __syncthreads(); + + for (int s = (blockDim.x >> 1); s > 32; s >>= 1) { + if (tid < s && tid + s < end) { + if (sdata[tid + s] > sdata[tid]) { + sdata[tid] = sdata[tid + s]; + indices[tid] = indices[tid + s]; + } + } + __syncthreads(); + } + + UNROLL_MAXARG_LOOP(32, end); + UNROLL_MAXARG_LOOP(16, end); + UNROLL_MAXARG_LOOP(8, end); + UNROLL_MAXARG_LOOP(4, end); + UNROLL_MAXARG_LOOP(2, end); + UNROLL_MAXARG_LOOP(1, end); + + if (tid == 0) { + d_out[blockIdx.x + batchIdx * gridDim.x] = sdata[0]; + d_ind[blockIdx.x + batchIdx * gridDim.x] = indices[0]; + } + __syncthreads(); + } +} + +__global__ void gMaxElementUpdate(float* binCosts, int* binIdxs, float* probs, int *batchFirstElements, float* outCosts, int* outIdxs, int *cummulatedBeamSizes, int NUM_BLOCKS) { + extern __shared__ float sdata[]; + __shared__ int indices[512]; + __shared__ float bestBinCost; + __shared__ int bestBinCostIdx; + + const int tid = threadIdx.x; + const int batchIdx = blockIdx.x; + const int N = batchFirstElements[batchIdx + 1] - batchFirstElements[batchIdx]; + int num_bins = int(N / (2 * 512)) + int(N % (2 * 512) != 0); + if (num_bins > 500) { + num_bins = 500; + } + + for (int pos = cummulatedBeamSizes[batchIdx]; pos < cummulatedBeamSizes[batchIdx + 1]; ++pos) { + int i = tid; + + sdata[tid] = -3.40282e+38f; + + if (i < num_bins) { + sdata[tid] = binCosts[batchIdx * NUM_BLOCKS + i]; + indices[tid] = i; + } + + if (i + blockDim.x < num_bins) { + float a = binCosts[batchIdx * NUM_BLOCKS + i]; + float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x]; + if (a > b) { + sdata[tid] = a; + indices[tid] = i; + } else { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + + while (i + 2 * blockDim.x < num_bins) { + i += 2 * blockDim.x; + + float a = binCosts[batchIdx * NUM_BLOCKS + i]; + if (a > sdata[tid]) { + sdata[tid] = a; + indices[tid] = i; + } + + if (i + blockDim.x < num_bins) { + float b = binCosts[batchIdx * NUM_BLOCKS + i + blockDim.x]; + if (b > sdata[tid]) { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + } + + __syncthreads(); + + for (int s = (blockDim.x >> 1); s > 32; s >>= 1) { + if (tid < s && tid + s < num_bins) { + if (sdata[tid + s] > sdata[tid]) { + sdata[tid] = sdata[tid + s]; + indices[tid] = indices[tid + s]; + } + } + __syncthreads(); + } + + UNROLL_MAXARG_LOOP(32, num_bins); + UNROLL_MAXARG_LOOP(16, num_bins); + UNROLL_MAXARG_LOOP(8, num_bins); + UNROLL_MAXARG_LOOP(4, num_bins); + UNROLL_MAXARG_LOOP(2, num_bins); + UNROLL_MAXARG_LOOP(1, num_bins); + + if (tid == 0) { + bestBinCost = sdata[0]; + bestBinCostIdx = batchIdx * NUM_BLOCKS + indices[0]; + + probs[binIdxs[bestBinCostIdx]] = -3.40282e+38f; + + outIdxs[pos] = binIdxs[bestBinCostIdx]; + outCosts[pos] = bestBinCost; + } + + __syncthreads(); + + i = batchFirstElements[batchIdx] + (bestBinCostIdx - batchIdx * NUM_BLOCKS) * 
(blockDim.x * 2) + tid; + const int dist = num_bins * 2 * blockDim.x; + + sdata[tid] = -3.40282e+38f; + + if (i < batchFirstElements[batchIdx + 1]) { + sdata[tid] = probs[i]; + indices[tid] = i; + } + + if (i + blockDim.x < batchFirstElements[batchIdx + 1]) { + float a = probs[i]; + float b = probs[i+blockDim.x]; + if (a > b) { + sdata[tid] = a; + indices[tid] = i; + } else { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + + while (i + dist < batchFirstElements[batchIdx + 1]) { + i += dist; + + float a = probs[i]; + if (a > sdata[tid]) { + sdata[tid] = a; + indices[tid] = i; + } + + if (i + blockDim.x < batchFirstElements[batchIdx + 1]) { + float b = probs[i + blockDim.x]; + if (b > sdata[tid]) { + sdata[tid] = b; + indices[tid] = i + blockDim.x; + } + } + } + + __syncthreads(); + + for (int s = (blockDim.x >> 1); s > 32; s >>= 1) { + if (tid < s && tid + s < batchFirstElements[batchIdx + 1]) { + if (sdata[tid + s] > sdata[tid]) { + sdata[tid] = sdata[tid + s]; + indices[tid] = indices[tid + s]; + } + } + __syncthreads(); + } + + UNROLL_MAXARG_LOOP(32, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(16, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(8, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(4, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(2, batchFirstElements[batchIdx + 1]); + UNROLL_MAXARG_LOOP(1, batchFirstElements[batchIdx + 1]); + + if (tid == 0) { + binCosts[bestBinCostIdx] = sdata[0]; + binIdxs[bestBinCostIdx] = indices[0]; + } + __syncthreads(); + } +} + +__global__ void gGetValueByKey(float* d_in, float* d_out, int* indeces, int n) +{ + int tid = threadIdx.x + blockDim.x * blockIdx.x; + if (tid < n) { + int index = indeces[tid]; + d_out[tid] = d_in[index]; + } +} + +NthElement::NthElement(size_t maxBeamSize, size_t maxBatchSize, cudaStream_t stream) + : stream_(stream) , + NUM_BLOCKS(std::min(500, int(maxBeamSize * 85000 / (2 * BLOCK_SIZE)) + int(maxBeamSize * 85000 % (2 * BLOCK_SIZE) != 0))) +{ + //std::cerr << "NthElement::NthElement" << std::endl; + + HANDLE_ERROR( cudaMalloc((void**)&d_ind, maxBatchSize * NUM_BLOCKS * sizeof(int)) ); + + HANDLE_ERROR( cudaMalloc((void**)&d_out, maxBatchSize * NUM_BLOCKS * sizeof(float)) ); + + HANDLE_ERROR( cudaMalloc((void**)&d_res_idx, maxBatchSize * maxBeamSize * sizeof(int)) ); + HANDLE_ERROR( cudaMalloc((void**)&d_res, maxBatchSize * maxBeamSize * sizeof(float)) ); + + HANDLE_ERROR( cudaHostAlloc((void**) &h_res, maxBeamSize * maxBatchSize* sizeof(float), + cudaHostAllocDefault) ); + HANDLE_ERROR( cudaHostAlloc((void**) &h_res_idx, maxBeamSize * maxBatchSize * sizeof(int), + cudaHostAllocDefault) ); + + HANDLE_ERROR( cudaMalloc((void**)&d_breakdown, maxBeamSize * sizeof(float)) ); + HANDLE_ERROR( cudaMalloc((void**)&d_batchPosition, (maxBatchSize + 1) * sizeof(int)) ); + HANDLE_ERROR( cudaMalloc((void**)&d_cumBeamSizes, (maxBatchSize + 1) * sizeof(int)) ); +} + +NthElement::~NthElement() +{ + HANDLE_ERROR(cudaFree(d_ind)); + HANDLE_ERROR(cudaFree(d_out)); + HANDLE_ERROR(cudaFree(d_res_idx)); + HANDLE_ERROR(cudaFree(d_res)); + HANDLE_ERROR(cudaFreeHost(h_res)); + HANDLE_ERROR(cudaFreeHost(h_res_idx)); + HANDLE_ERROR(cudaFree(d_breakdown)); + HANDLE_ERROR(cudaFree(d_batchPosition)); + HANDLE_ERROR(cudaFree(d_cumBeamSizes)); +} + +void NthElement::getNBestList(float* probs, const std::vector& batchFirstElementIdxs, + const std::vector& cummulatedBeamSizes) +{ + HANDLE_ERROR( cudaMemcpyAsync(d_batchPosition, batchFirstElementIdxs.data(), batchFirstElementIdxs.size() * sizeof(int), + 
cudaMemcpyHostToDevice, stream_) ); + HANDLE_ERROR( cudaMemcpyAsync(d_cumBeamSizes, cummulatedBeamSizes.data(), cummulatedBeamSizes.size() * sizeof(int), + cudaMemcpyHostToDevice, stream_) ); + + const int numBatches = batchFirstElementIdxs.size() - 1; + + gMaxElement<<>> + (d_out, d_ind, probs, numBatches, d_batchPosition); + + gMaxElementUpdate<<>> + (d_out, d_ind, probs, d_batchPosition, d_res, d_res_idx, d_cumBeamSizes, NUM_BLOCKS); +} + +void NthElement::getNBestList(const std::vector& beamSizes, Tensor Probs, + std::vector& outCosts, std::vector& outKeys, + const bool isFirst) { + std::vector cummulatedBeamSizes(beamSizes.size() + 1, 0); + std::vector batchFirstElementIdxs(beamSizes.size() + 1, 0); + + const size_t vocabSize = Probs->shape()[1]; + for (size_t i = 0; i < beamSizes.size(); ++i) { + + cummulatedBeamSizes[i + 1] = cummulatedBeamSizes[i] + beamSizes[i]; + batchFirstElementIdxs[i + 1] += ((isFirst) ? (i + 1) : cummulatedBeamSizes[i + 1]) * vocabSize; + } + + getNBestList(Probs->data(), batchFirstElementIdxs, cummulatedBeamSizes); + GetPairs(cummulatedBeamSizes.back(), outKeys, outCosts); + +} + +void NthElement::GetPairs(size_t number, + std::vector& outKeys, + std::vector& outValues) { + + HANDLE_ERROR( cudaMemcpyAsync(h_res, d_res, number * sizeof(float), + cudaMemcpyDeviceToHost, stream_) ); + HANDLE_ERROR( cudaMemcpyAsync(h_res_idx, d_res_idx, number * sizeof(int), + cudaMemcpyDeviceToHost, stream_) ); + cudaStreamSynchronize(stream_); + + for (size_t i = 0; i < number; ++i) { + outKeys.push_back(h_res_idx[i]); + outValues.push_back(h_res[i]); + } + + lastN = number; +} + +void NthElement::getValueByKey(std::vector& out, float* d_in) { + gGetValueByKey<<<1, lastN, 0, stream_>>> + (d_in, d_breakdown, h_res_idx, lastN); + + HANDLE_ERROR( cudaMemcpyAsync(out.data(), d_breakdown, lastN * sizeof(float), + cudaMemcpyDeviceToHost, stream_) ); + HANDLE_ERROR( cudaStreamSynchronize(stream_)); +} + +} + diff --git a/src/translator/nth_element.h b/src/translator/nth_element.h new file mode 100644 index 00000000..9ac25da5 --- /dev/null +++ b/src/translator/nth_element.h @@ -0,0 +1,51 @@ +#pragma once + +#include +#include + +#include +#include "tensors/tensor.h" + +namespace marian { + +class NthElement { + public: + NthElement() = delete; + NthElement(const NthElement ©) = delete; + NthElement(size_t maxBeamSize, size_t maxBatchSize, cudaStream_t stream); + virtual ~NthElement(); + + void getNBestList(float* probs, const std::vector& batchFirstElementIdxs, + const std::vector& cummulatedBeamSizes); + + void getNBestList(const std::vector& beamSizes, Tensor Probs, + std::vector& outCosts, std::vector& outKeys, + const bool isFirst=false); + + void GetPairs(size_t number, + std::vector& outKeys, + std::vector& outValues); + + void getValueByKey(std::vector& out, float* d_in); + + private: + const int BLOCK_SIZE = 512; + const int NUM_BLOCKS; + cudaStream_t stream_; + int *d_ind; + + float *d_out; + + int *d_res_idx; + float *d_res; + + int *h_res_idx; + float *h_res; + + float *d_breakdown; + int *d_batchPosition; + int *d_cumBeamSizes; + size_t lastN; +}; + +}
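
A note on the shard sizing in AsyncGraphGroup above: the shard size must be the ceiling of `totalSize / numDevices`, since plain integer division truncates and would leave the tail of the parameter vector uncovered (which is also why `ceil` needs a float operand there). The following is a standalone sketch of that arithmetic with hypothetical names, not code from the patch:

    // Integer ceil division spreads totalSize over numDevices shards;
    // the last shard absorbs any shortfall, mirroring min(shardSize_, totalSize).
    #include <algorithm>
    #include <cassert>
    #include <vector>

    std::vector<int> shardSizes(int totalSize, int numDevices) {
      int shardSize = (totalSize + numDevices - 1) / numDevices;  // ceil division
      std::vector<int> sizes;
      int remaining = totalSize;
      for(int d = 0; d < numDevices; ++d) {
        int size = std::min(shardSize, remaining);
        remaining -= size;
        sizes.push_back(size);
      }
      assert(remaining == 0);  // every parameter belongs to exactly one shard
      return sizes;
    }

For example, 10 parameters over 3 devices yields shards of 4, 4 and 2.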
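The point of `shardSync_` in `pushGradients` is that each shard carries its own mutex, so two workers only serialize when they touch the same parameter slice instead of contending on one global lock. A minimal CPU-only sketch of the pattern, with assumed names and plain SGD standing in for `shardOpt_` (the real code moves GPU tensors):

    #include <algorithm>
    #include <mutex>
    #include <thread>
    #include <vector>

    struct Shard {
      std::mutex sync;
      std::vector<float> params, grads;
    };

    void pushGradients(std::vector<Shard>& shards,
                       const std::vector<float>& newGrads) {
      std::vector<std::thread> threads;
      size_t pos = 0;
      for(auto& shard : shards) {
        size_t size = shard.grads.size();
        threads.emplace_back([&shard, &newGrads, pos, size]() {
          std::lock_guard<std::mutex> guard(shard.sync);  // per-shard lock
          std::copy(newGrads.begin() + pos, newGrads.begin() + pos + size,
                    shard.grads.begin());
          for(size_t i = 0; i < size; ++i)   // SGD stands in for the optimizer
            shard.params[i] -= 0.01f * shard.grads[i];
        });
        pos += size;
      }
      for(auto& t : threads)
        t.join();
    }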
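The async task lambda uses a `thread_local` graph/builder pair so that each pool worker claims a device slot exactly once, on its first task, and every later batch it processes stays on the same GPU. A condensed sketch of that binding trick (hypothetical names):

    #include <cstddef>
    #include <mutex>

    struct Worker { int id = -1; };  // stand-in for the graph+builder pair

    int nextId = 0;                  // guarded by initMutex, like `i` and sync_
    std::mutex initMutex;

    void runTask() {
      thread_local Worker worker;    // one instance per pool thread
      if(worker.id == -1) {
        std::lock_guard<std::mutex> lock(initMutex);
        worker.id = nextId++;        // claim the next free device slot once
      }
      // ... process the batch on device worker.id ...
    }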
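SyncGraphGroup implements the synchronous alternative: collect one batch per worker, sum the workers' gradients into worker 0, average, take a single optimizer step, then broadcast the parameters back. A CPU-vector sketch of that data flow under assumed names:

    #include <vector>

    void syncStep(std::vector<std::vector<float>>& grads,  // one buffer per worker
                  std::vector<float>& params, float lr) {
      std::vector<float>& master = grads[0];
      for(size_t w = 1; w < grads.size(); ++w)             // accumulateGradients
        for(size_t i = 0; i < master.size(); ++i)
          master[i] += grads[w][i];
      for(size_t i = 0; i < master.size(); ++i) {
        master[i] /= grads.size();        // like Element(_1 /= denom, grads)
        params[i] -= lr * master[i];      // SGD stands in for opt_->update
      }
      // distributeParameters would now copy `params` back to every worker
    }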
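The early-stopping logic is split between Reporter and Validator: a validation score that fails to improve on the best seen so far bumps a stall counter, and `keepGoing()` stops training once that counter reaches the configured patience ("early-stopping"). Condensed into one self-contained sketch:

    #include <cstddef>
    #include <limits>

    struct EarlyStopping {
      bool lowerIsBetter;
      float best;
      std::size_t stalled = 0;
      std::size_t patience;

      EarlyStopping(std::size_t patience, bool lowerIsBetter = true)
          : lowerIsBetter(lowerIsBetter),
            best(lowerIsBetter ? std::numeric_limits<float>::max()
                               : std::numeric_limits<float>::lowest()),
            patience(patience) {}

      // Returns true while training should continue.
      bool report(float value) {
        bool improved = lowerIsBetter ? value < best : value > best;
        if(improved) {
          best = value;
          stalled = 0;
        } else {
          ++stalled;
        }
        return stalled < patience;
      }
    };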
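The two validators accumulate the same summed cost (`topNode()->scalar() * batch->size()`) and differ only in the normalizer: mean cost per sentence for cross-entropy, versus the exponential of the mean cost per word for perplexity. As two hypothetical helper functions:

    #include <cmath>
    #include <cstddef>

    float crossEntropyScore(float costSum, std::size_t sentences) {
      return costSum / sentences;
    }

    float perplexityScore(float costSum, std::size_t words) {
      return expf(costSum / words);  // ppl = exp(total cost / total words)
    }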
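In nth_element.cu, `gMaxElement` is a grid-wide arg-max reduction that leaves one candidate per block ("bin"), and `gMaxElementUpdate` then loops beam-size times: pick the best bin, emit its (cost, index) pair, overwrite that probability with -FLT_MAX, and re-reduce only the bin it changed. A host-side reference for what this computes per batch entry, with assumed names:

    #include <cstddef>
    #include <limits>
    #include <utility>
    #include <vector>

    std::vector<std::pair<float, int>> topK(std::vector<float> probs,
                                            std::size_t k) {
      std::vector<std::pair<float, int>> best;
      for(std::size_t n = 0; n < k && n < probs.size(); ++n) {
        int argmax = 0;
        for(int i = 1; i < (int)probs.size(); ++i)
          if(probs[i] > probs[argmax])
            argmax = i;
        best.push_back({probs[argmax], argmax});
        probs[argmax] = std::numeric_limits<float>::lowest();  // mask, like the kernel
      }
      return best;
    }

The GPU version avoids this quadratic rescan by keeping the per-bin maxima and only re-reducing the single bin whose winner was masked out.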
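Finally, the `isFirst` branch in `NthElement::getNBestList` reflects how the flattened score matrix is partitioned: each live hypothesis contributes `vocabSize` scores, and on the very first beam-search step every sentence has exactly one hypothesis, hence the `(i + 1)` offset instead of the cumulative beam size. A sketch of the offset computation (hypothetical helper, same logic as the loop in the patch):

    #include <cstddef>
    #include <vector>

    std::vector<int> batchOffsets(const std::vector<size_t>& beamSizes,
                                  size_t vocabSize, bool isFirst) {
      std::vector<int> cumBeam(beamSizes.size() + 1, 0);
      std::vector<int> offsets(beamSizes.size() + 1, 0);
      for(size_t i = 0; i < beamSizes.size(); ++i) {
        cumBeam[i + 1] = cumBeam[i] + beamSizes[i];
        // first step: one hypothesis per sentence; later: cumulative beam
        offsets[i + 1] = (isFirst ? (i + 1) : cumBeam[i + 1]) * vocabSize;
      }
      return offsets;
    }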