diff --git a/.gitignore b/.gitignore
index f78c8028..53468680 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,3 @@ build
# Examples
examples/*/*.gz
examples/mnist/*ubyte
-
-.cproject
-.project
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 28287da1..0ad27c34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,7 @@ project(marian CXX)
find_package(CUDA "8.0" REQUIRED)
SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -Wno-unused-result -Wno-deprecated -fPIC -Wno-deprecated-gpu-targets")
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35; -DCUDNN)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11; --default-stream per-thread; -g; -O3; --use_fast_math; -Xcompiler '-fPIC'; -arch=sm_35;)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
include_directories(${amunn_SOURCE_DIR})
diff --git a/README.md b/README.md
index 502df966..c4b5fa1a 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
Marian
======
-[![Join the chat at https://gitter.im/MarianNMT/Lobby](https://badges.gitter.im/MarianNMT/Lobby.svg)](https://gitter.im/MarianNMT/Lobby?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![Join the chat at https://gitter.im/amunmt/marian](https://badges.gitter.im/amunmt/marian.svg)](https://gitter.im/amunmt/marian?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![Build Status](http://vali.inf.ed.ac.uk/jenkins/buildStatus/icon?job=Marian)](http://vali.inf.ed.ac.uk/jenkins/job/Marian/)
Google group for commit messages: https://groups.google.com/forum/#!forum/mariannmt
@@ -17,30 +18,12 @@ Installation
Requirements:
* g++ with c++11
-* CUDA and CuDNN
+* CUDA
* Boost (>= 1.56)
-Exporting some paths for CuDNN may be required (put it, for example, in your `.bashrc` file):
-
- export PATH=$PATH:$HOME/.local/bin:/usr/local/cuda/bin
- export LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64
- export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:/usr/local/cudnn-5/lib64
- export CPATH=$CPATH:/usr/local/cudnn-5/include
-
Compilation with `cmake > 3.5`:
mkdir build
cd build
cmake ..
make -j
-
-To compile API documentation using Doxygen, first cd to the build directory, and then:
-
- make doc
-
-To test, first compile, then:
-
- cd examples/mnist
- make
- cd ../../build
- ./mnist_benchmark
diff --git a/marian/.cproject b/marian/.cproject
new file mode 100644
index 00000000..184c39a4
--- /dev/null
+++ b/marian/.cproject
@@ -0,0 +1,163 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/marian/.project b/marian/.project
new file mode 100644
index 00000000..d1163076
--- /dev/null
+++ b/marian/.project
@@ -0,0 +1,34 @@
+
+
+ marian
+
+
+
+
+
+ org.eclipse.cdt.managedbuilder.core.genmakebuilder
+ clean,full,incremental,
+
+
+
+
+ org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder
+ full,incremental,
+
+
+
+
+
+ org.eclipse.cdt.core.cnature
+ org.eclipse.cdt.core.ccnature
+ org.eclipse.cdt.managedbuilder.core.managedBuildNature
+ org.eclipse.cdt.managedbuilder.core.ScannerConfigNature
+
+
+
+ src
+ 2
+ PARENT-1-PROJECT_LOC/src
+
+
+
diff --git a/src/3rd_party/threadpool.h b/src/3rd_party/threadpool.h
index fb77dfe6..1938b95c 100644
--- a/src/3rd_party/threadpool.h
+++ b/src/3rd_party/threadpool.h
@@ -45,6 +45,7 @@ class ThreadPool {
template
auto enqueue(F&& f, Args&&... args)
-> std::future::type>;
+
~ThreadPool();
size_t getNumTasks() const {
@@ -128,6 +129,3 @@ inline ThreadPool::~ThreadPool() {
worker.join();
}
}
-
-
-
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 162c53bd..c56b5c80 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,8 +13,13 @@ cuda_add_library(marian_lib
graph/node_operators.cu
tensors/tensor.cu
kernels/tensor_operators.cu
+ kernels/dropout.cu
layers/param_initializers.cpp
common/utils.cpp
+ common/logging.cpp
+ common/history.cpp
+ training/config.cpp
+ translator/nth_element.cu
data/vocab.cpp
data/corpus.cpp
$
@@ -27,30 +32,39 @@ cuda_add_executable(
test/tensor_test.cu
)
+cuda_add_executable(
+ marian_translate
+ test/marian_translate.cu
+)
+
cuda_add_executable(
marian_test
test/marian_test.cu
)
+cuda_add_executable(
+ bn_test
+ test/bn_test.cu
+)
+
cuda_add_executable(
marian
- command/config.cpp
command/marian.cu
)
cuda_add_executable(
dropout_test
test/dropout_test.cu
- kernels/dropout_cudnn.cu
)
target_link_libraries(marian marian_lib)
target_link_libraries(tensor_test marian_lib)
target_link_libraries(marian_test marian_lib)
target_link_libraries(dropout_test marian_lib)
+target_link_libraries(marian_translate marian_lib)
+target_link_libraries(bn_test marian_lib)
-foreach(exec tensor_test marian_test marian dropout_test)
- target_link_libraries(${exec} ${EXT_LIBS} cudnn)
+foreach(exec dropout_test tensor_test marian_test marian_translate marian bn_test)
target_link_libraries(${exec} ${EXT_LIBS} curand)
cuda_add_cublas_to_target(${exec})
set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")
diff --git a/src/command/marian.cu b/src/command/marian.cu
index dd8b3301..8e255fa2 100644
--- a/src/command/marian.cu
+++ b/src/command/marian.cu
@@ -1,86 +1,13 @@
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
#include "marian.h"
-#include "optimizers/optimizers.h"
-#include "optimizers/clippers.h"
-#include "data/batch_generator.h"
-#include "data/corpus.h"
-#include "models/nematus.h"
-
-#include "common/logging.h"
-#include "command/config.h"
-#include "parallel/graph_group.h"
-
-namespace marian {
-
- void TrainingLoop(Ptr options,
- Ptr> batchGenerator) {
-
- auto reporter = New(options);
- Ptr graphGroup = New>(options);
- graphGroup->setReporter(reporter);
-
- size_t epochs = 1;
- size_t batches = 0;
- while((options->get("after-epochs") == 0
- || epochs <= options->get("after-epochs")) &&
- (options->get("after-batches") == 0
- || batches < options->get("after-batches"))) {
-
- batchGenerator->prepare(!options->get("no-shuffle"));
-
- boost::timer::cpu_timer timer;
-
- while(*batchGenerator) {
-
- auto batch = batchGenerator->next();
- graphGroup->update(batch);
-
- }
- epochs++;
- LOG(info) << "Starting epoch " << epochs << " after "
- << reporter->samples << " samples";
- }
- LOG(info) << "Training finshed";
- graphGroup->save();
- }
-}
+#include "models/gnmt.h"
int main(int argc, char** argv) {
using namespace marian;
- using namespace data;
- using namespace keywords;
- std::shared_ptr info;
- info = spdlog::stderr_logger_mt("info");
- info->set_pattern("[%Y-%m-%d %T] %v");
-
- auto options = New(argc, argv);
- std::cerr << *options << std::endl;
-
- auto dimVocabs = options->get>("dim-vocabs");
- int dimEmb = options->get("dim-emb");
- int dimRnn = options->get("dim-rnn");
- int dimBatch = options->get("mini-batch");
- int dimMaxiBatch = options->get("maxi-batch");
+ auto options = New(argc, argv);;
- auto trainSets = options->get>("trainsets");
- auto vocabs = options->get>("vocabs");
- size_t maxSentenceLength = options->get("max-length");
- auto corpus = New(trainSets, vocabs, dimVocabs, maxSentenceLength);
- auto bg = New>(corpus, dimBatch, dimMaxiBatch);
-
- TrainingLoop(options, bg);
+ Train>(options);
return 0;
}
diff --git a/src/common/definitions.h b/src/common/definitions.h
index 9e117a2a..997333f0 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -30,6 +30,7 @@
#include
#include "shape.h"
+#include "common/logging.h"
namespace marian {
@@ -93,7 +94,6 @@ namespace marian {
// An enumeration of directions
enum struct dir { forward, backward, bidirect };
-
/**
* @brief Defines a set of keywords.
*
@@ -101,27 +101,32 @@ namespace marian {
* will result in the creation of an instance of the Keyword class.
*/
namespace keywords {
- KEY(axis, int)
- KEY(shape, Shape)
- KEY(value, float)
- KEY(prefix, std::string)
- KEY(final, bool)
- KEY(output_last, bool)
- KEY(activation, act)
- KEY(direction, dir)
- KEY(mask, Expr)
- KEY(init, std::function)
+ KEY(axis, int);
+ KEY(shape, Shape);
+ KEY(value, float);
+ KEY(prefix, std::string);
+ KEY(final, bool);
+ KEY(output_last, bool);
+ KEY(activation, act);
+ KEY(direction, dir);
+ KEY(mask, Expr);
+ KEY(dropout_prob, float);
+ KEY(init, std::function);
- KEY(eta, float)
- KEY(beta1, float)
- KEY(beta2, float)
- KEY(eps, float)
- KEY(optimizer, Ptr)
- KEY(clip, Ptr)
- KEY(batch_size, int)
- KEY(max_epochs, int)
- KEY(valid, Ptr)
+ KEY(eta, float);
+ KEY(beta1, float);
+ KEY(beta2, float);
+ KEY(eps, float);
+ KEY(optimizer, Ptr);
+ KEY(clip, Ptr);
+ KEY(batch_size, int);
+ KEY(normalize, bool);
+ KEY(skip, bool);
+ KEY(skip_first, bool);
+ KEY(coverage, Expr);
+ KEY(max_epochs, int);
+ KEY(valid, Ptr);
}
}
diff --git a/src/common/history.cpp b/src/common/history.cpp
new file mode 100644
index 00000000..3d3ad857
--- /dev/null
+++ b/src/common/history.cpp
@@ -0,0 +1,10 @@
+#include "history.h"
+
+namespace marian {
+
+History::History(size_t lineNo)
+ : normalize_(true),
+ lineNo_(lineNo)
+{}
+
+}
diff --git a/src/common/history.h b/src/common/history.h
new file mode 100755
index 00000000..fca8a4b5
--- /dev/null
+++ b/src/common/history.h
@@ -0,0 +1,79 @@
+#pragma once
+
+#include
+
+#include "hypothesis.h"
+
+namespace marian {
+
+class History {
+ private:
+ struct HypothesisCoord {
+ bool operator<(const HypothesisCoord& hc) const {
+ return cost < hc.cost;
+ }
+
+ size_t i;
+ size_t j;
+ float cost;
+ };
+
+ public:
+ History(size_t lineNo);
+
+ void Add(const Beam& beam, bool last = false) {
+ if (beam.back()->GetPrevHyp() != nullptr) {
+ for (size_t j = 0; j < beam.size(); ++j)
+ if(beam[j]->GetWord() == 0 || last) {
+ float cost = normalize_ ? beam[j]->GetCost() / history_.size() : beam[j]->GetCost();
+ topHyps_.push({ history_.size(), j, cost });
+ }
+ }
+ history_.push_back(beam);
+ }
+
+ size_t size() const {
+ return history_.size();
+ }
+
+ NBestList NBest(size_t n) const {
+ NBestList nbest;
+ auto topHypsCopy = topHyps_;
+ while (nbest.size() < n && !topHypsCopy.empty()) {
+ auto bestHypCoord = topHypsCopy.top();
+ topHypsCopy.pop();
+
+ size_t start = bestHypCoord.i;
+ size_t j = bestHypCoord.j;
+
+ Words targetWords;
+ Ptr bestHyp = history_[start][j];
+ while(bestHyp->GetPrevHyp() != nullptr) {
+ targetWords.push_back(bestHyp->GetWord());
+ bestHyp = bestHyp->GetPrevHyp();
+ }
+
+ std::reverse(targetWords.begin(), targetWords.end());
+ nbest.emplace_back(targetWords, history_[bestHypCoord.i][bestHypCoord.j]);
+ }
+ return nbest;
+ }
+
+ Result Top() const {
+ return NBest(1)[0];
+ }
+
+ size_t GetLineNum() const
+ { return lineNo_; }
+
+ private:
+ std::vector history_;
+ std::priority_queue topHyps_;
+ bool normalize_;
+ size_t lineNo_;
+
+};
+
+typedef std::vector Histories;
+
+}
diff --git a/src/common/hypothesis.h b/src/common/hypothesis.h
new file mode 100644
index 00000000..08744566
--- /dev/null
+++ b/src/common/hypothesis.h
@@ -0,0 +1,58 @@
+#pragma once
+#include
+
+#include "common/definitions.h"
+
+namespace marian {
+
+class Hypothesis {
+ public:
+ Hypothesis()
+ : prevHyp_(nullptr),
+ prevIndex_(0),
+ word_(0),
+ cost_(0.0)
+ {}
+
+ Hypothesis(const Ptr prevHyp, size_t word, size_t prevIndex, float cost)
+ : prevHyp_(prevHyp),
+ prevIndex_(prevIndex),
+ word_(word),
+ cost_(cost)
+ {}
+
+ const Ptr GetPrevHyp() const {
+ return prevHyp_;
+ }
+
+ size_t GetWord() const {
+ return word_;
+ }
+
+ size_t GetPrevStateIndex() const {
+ return prevIndex_;
+ }
+
+ float GetCost() const {
+ return cost_;
+ }
+
+ std::vector& GetCostBreakdown() {
+ return costBreakdown_;
+ }
+
+ private:
+ const Ptr prevHyp_;
+ const size_t prevIndex_;
+ const size_t word_;
+ const float cost_;
+ std::vector costBreakdown_;
+};
+
+typedef std::vector> Beam;
+typedef std::vector Beams;
+typedef std::vector Words;
+typedef std::pair> Result;
+typedef std::vector NBestList;
+
+}
diff --git a/src/common/keywords.h b/src/common/keywords.h
index db0b5f20..01eb5898 100644
--- a/src/common/keywords.h
+++ b/src/common/keywords.h
@@ -219,7 +219,7 @@ namespace keywords {
*/
#define KEY(name, value_type) \
typedef const Keyword name ## _k; \
-name ## _k name;
+name ## _k name
}
diff --git a/src/common/logging.cpp b/src/common/logging.cpp
new file mode 100644
index 00000000..8621205e
--- /dev/null
+++ b/src/common/logging.cpp
@@ -0,0 +1,42 @@
+#include "logging.h"
+#include "training/config.h"
+
+std::shared_ptr stderrLogger(const std::string& name,
+ const std::string& pattern,
+ const std::vector& files) {
+ std::vector sinks;
+
+ auto stderr_sink = spdlog::sinks::stderr_sink_mt::instance();
+ sinks.push_back(stderr_sink);
+
+ for(auto&& file : files) {
+ auto file_sink = std::make_shared(file, true);
+ sinks.push_back(file_sink);
+ }
+
+ auto logger = std::make_shared(name, begin(sinks), end(sinks));
+
+ spdlog::register_logger(logger);
+ logger->set_pattern(pattern);
+ return logger;
+}
+
+void createLoggers(const marian::Config& options) {
+
+ std::vector generalLogs;
+ std::vector validLogs;
+ if(options.has("log")) {
+ generalLogs.push_back(options.get("log"));
+ validLogs.push_back(options.get("log"));
+ }
+
+ if(options.has("valid-log")) {
+ validLogs.push_back(options.get("valid-log"));
+ }
+
+ Logger info{stderrLogger("info", "[%Y-%m-%d %T] %v", generalLogs)};
+ Logger config{stderrLogger("config", "[%Y-%m-%d %T] [config] %v", generalLogs)};
+ Logger memory{stderrLogger("memory", "[%Y-%m-%d %T] [memory] %v", generalLogs)};
+ Logger data{stderrLogger("data", "[%Y-%m-%d %T] [data] %v", generalLogs)};
+ Logger valid{stderrLogger("valid", "[%Y-%m-%d %T] [valid] %v", validLogs)};
+}
diff --git a/src/common/logging.h b/src/common/logging.h
index 9959fc44..2583a071 100644
--- a/src/common/logging.h
+++ b/src/common/logging.h
@@ -3,3 +3,15 @@
#include "spdlog/spdlog.h"
#define LOG(logger) spdlog::get(#logger)->info()
+
+typedef std::shared_ptr Logger;
+Logger stderrLogger(const std::string&, const std::string&,
+ const std::vector& = {});
+
+namespace marian {
+ class Config;
+}
+
+void createLoggers(const marian::Config& options);
+
+
diff --git a/src/data/batch_generator.h b/src/data/batch_generator.h
index 3c6832fc..353a4df2 100644
--- a/src/data/batch_generator.h
+++ b/src/data/batch_generator.h
@@ -5,7 +5,8 @@
#include
-#include "dataset.h"
+#include "data/dataset.h"
+#include "training/config.h"
namespace marian {
@@ -21,22 +22,24 @@ class BatchGenerator {
private:
Ptr data_;
+ Ptr options_;
+
typename DataSet::iterator current_;
- size_t batchSize_;
size_t maxiBatchSize_;
std::deque bufferedBatches_;
BatchPtr currentBatch_;
- void fillBatches() {
+ void fillBatches(bool shuffle=true) {
auto cmp = [](const sample& a, const sample& b) {
return a[0].size() < b[0].size();
};
std::priority_queue maxiBatch(cmp);
- while(current_ != data_->end() && maxiBatch.size() < maxiBatchSize_) {
+ int maxSize = options_->get("mini-batch") * options_->get("maxi-batch");
+ while(current_ != data_->end() && maxiBatch.size() < maxSize) {
maxiBatch.push(*current_);
current_++;
}
@@ -45,7 +48,7 @@ class BatchGenerator {
while(!maxiBatch.empty()) {
batchVector.push_back(maxiBatch.top());
maxiBatch.pop();
- if(batchVector.size() == batchSize_) {
+ if(batchVector.size() == options_->get("mini-batch")) {
bufferedBatches_.push_back(data_->toBatch(batchVector));
batchVector.clear();
}
@@ -53,17 +56,15 @@ class BatchGenerator {
if(!batchVector.empty())
bufferedBatches_.push_back(data_->toBatch(batchVector));
- std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end());
+ if(shuffle)
+ std::random_shuffle(bufferedBatches_.begin(), bufferedBatches_.end());
}
public:
BatchGenerator(Ptr data,
- size_t batchSize=80,
- size_t maxiBatchNum=20)
+ Ptr options)
: data_(data),
- batchSize_(batchSize),
- maxiBatchSize_(batchSize * maxiBatchNum)
- { }
+ options_(options) { }
operator bool() const {
return !bufferedBatches_.empty();
@@ -84,8 +85,10 @@ class BatchGenerator {
void prepare(bool shuffle=true) {
if(shuffle)
data_->shuffle();
+ else
+ data_->reset();
current_ = data_->begin();
- fillBatches();
+ fillBatches(shuffle);
}
};
diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp
index 03da5ea2..6230bd68 100644
--- a/src/data/corpus.cpp
+++ b/src/data/corpus.cpp
@@ -1,5 +1,6 @@
#include
-#include "corpus.h"
+
+#include "data/corpus.h"
namespace marian {
namespace data {
@@ -33,20 +34,53 @@ const SentenceTuple& CorpusIterator::dereference() const {
return tup_;
}
-Corpus::Corpus(const std::vector& textPaths,
- const std::vector& vocabPaths,
- const std::vector& maxVocabs,
- size_t maxLength)
- : textPaths_(textPaths),
- maxLength_(maxLength)
-{
- UTIL_THROW_IF2(textPaths.size() != vocabPaths.size(),
+Corpus::Corpus(Ptr options)
+ : options_(options),
+ textPaths_(options_->get>("train-sets")),
+ maxLength_(options_->get("max-length")) {
+
+ std::vector vocabPaths;
+ if(options_->has("vocabs"))
+ vocabPaths = options_->get>("vocabs");
+
+ UTIL_THROW_IF2(!vocabPaths.empty() && textPaths_.size() != vocabPaths.size(),
"Number of corpus files and vocab files does not agree");
+ std::vector maxVocabs =
+ options_->get>("dim-vocabs");
+
std::vector vocabs;
- for(int i = 0; i < vocabPaths.size(); ++i) {
- vocabs_.emplace_back(vocabPaths[i], maxVocabs[i]);
+ if(vocabPaths.empty()) {
+ for(int i = 0; i < textPaths_.size(); ++i) {
+ Ptr vocab = New();
+ vocab->loadOrCreate(textPaths_[i], maxVocabs[i]);
+ vocabs_.emplace_back(vocab);
+ }
}
+ else {
+ for(int i = 0; i < vocabPaths.size(); ++i) {
+ Ptr vocab = New();
+ vocab->load(vocabPaths[i], maxVocabs[i]);
+ vocabs_.emplace_back(vocab);
+ }
+ }
+
+
+ for(auto path : textPaths_) {
+ files_.emplace_back(new InputFileStream(path));
+ }
+}
+
+Corpus::Corpus(std::vector paths,
+ std::vector> vocabs,
+ Ptr options)
+ : options_(options),
+ textPaths_(paths),
+ vocabs_(vocabs),
+ maxLength_(options_->get("max-length")) {
+
+ UTIL_THROW_IF2(textPaths_.size() != vocabs_.size(),
+ "Number of corpus files and vocab files does not agree");
for(auto path : textPaths_) {
files_.emplace_back(new InputFileStream(path));
@@ -61,7 +95,7 @@ SentenceTuple Corpus::next() {
for(int i = 0; i < files_.size(); ++i) {
std::string line;
if(std::getline((std::istream&)*files_[i], line)) {
- Words words = vocabs_[i](line);
+ Words words = (*vocabs_[i])(line);
if(words.empty())
words.push_back(0);
tup.push_back(words);
@@ -82,8 +116,15 @@ void Corpus::shuffle() {
shuffleFiles(textPaths_);
}
+void Corpus::reset() {
+ files_.clear();
+ for(auto& path : textPaths_) {
+ files_.emplace_back(new InputFileStream(path));
+ }
+}
+
void Corpus::shuffleFiles(const std::vector& paths) {
- std::cerr << "Shuffling files" << std::endl;
+ LOG(data) << "Shuffling files";
std::vector> corpus;
files_.clear();
@@ -129,7 +170,7 @@ void Corpus::shuffleFiles(const std::vector& paths) {
files_.emplace_back(new InputFileStream(path));
}
- std::cerr << "Done" << std::endl;
+ LOG(data) << "Done";
}
}
diff --git a/src/data/corpus.h b/src/data/corpus.h
index 58233a6e..c3a0c358 100644
--- a/src/data/corpus.h
+++ b/src/data/corpus.h
@@ -4,6 +4,7 @@
#include
#include
+#include "training/config.h"
#include "common/definitions.h"
#include "data/vocab.h"
#include "common/file_stream.h"
@@ -38,11 +39,11 @@ class CorpusBatch {
}
std::cerr << std::endl;
- std::cerr << "\t m: ";
- for(auto w : b.second) {
- std::cerr << w << " ";
- }
- std::cerr << std::endl;
+ //std::cerr << "\t m: ";
+ //for(auto w : b.second) {
+ //std::cerr << w << " ";
+ //}
+ //std::cerr << std::endl;
}
}
}
@@ -88,9 +89,11 @@ class CorpusIterator
class Corpus {
private:
+ Ptr options_;
+
std::vector textPaths_;
std::vector> files_;
- std::vector vocabs_;
+ std::vector> vocabs_;
size_t maxLength_;
void shuffleFiles(const std::vector& paths);
@@ -102,14 +105,17 @@ class Corpus {
typedef CorpusIterator iterator;
typedef SentenceTuple sample;
- Corpus(const std::vector& textPaths,
- const std::vector& vocabPaths,
- const std::vector& maxVocabs,
- size_t maxLength = 50);
+ Corpus(Ptr options);
+
+ Corpus(std::vector paths,
+ std::vector> vocabs,
+ Ptr options);
sample next();
void shuffle();
+
+ void reset();
iterator begin() {
return iterator(*this);
@@ -118,6 +124,10 @@ class Corpus {
iterator end() {
return iterator();
}
+
+ std::vector>& getVocabs() {
+ return vocabs_;
+ }
batch_ptr toBatch(const std::vector& batchVector) {
int batchSize = batchVector.size();
diff --git a/src/data/mnist.h b/src/data/mnist.h
deleted file mode 100644
index bf1815dc..00000000
--- a/src/data/mnist.h
+++ /dev/null
@@ -1,188 +0,0 @@
-#pragma once
-
-// This file is part of the Marian toolkit.
-// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "dataset.h"
-#include "batch_generator.h"
-
-namespace marian {
-namespace data {
-
-/** @brief DataBase capable of reading MNIST data. */
-class MNIST : public DataBase {
- private:
- const int IMAGE_MAGIC_NUMBER;
- const int LABEL_MAGIC_NUMBER;
-
- Examples examples_;
-
- public:
-
- typedef Batch batch_type;
- typedef std::shared_ptr batch_ptr;
-
- /**
- * @brief Constructs a DataBase using MNIST data.
- *
- * @param featuresPath Path to file containing MNIST feature values
- * @param labelsPath Path to file containing MNIST labels
- */
- MNIST(const std::string& featuresPath,
- const std::string& labelsPath)
- : IMAGE_MAGIC_NUMBER(2051),
- LABEL_MAGIC_NUMBER(2049)
- {
- auto features = ReadImages(featuresPath);
- auto labels = ReadLabels(labelsPath);
-
- UTIL_THROW_IF2(features.size() != labels.size(),
- "Features do not match labels");
-
- for(int i = 0; i < features.size(); ++i)
- examples_.emplace_back(new Example({ features[i], labels[i] }));
- }
-
- ExampleIterator begin() const {
- return ExampleIterator(examples_.begin());
- }
-
- ExampleIterator end() const {
- return ExampleIterator(examples_.end());
- }
-
- void shuffle() {
- std::random_shuffle(examples_.begin(), examples_.end());
- }
-
- batch_ptr toBatch(const Examples& batchVector) {
- int batchSize = batchVector.size();
-
- std::vector maxDims;
- for(auto& ex : batchVector) {
- if(maxDims.size() < ex->size())
- maxDims.resize(ex->size(), 0);
- for(int i = 0; i < ex->size(); ++i) {
- if((*ex)[i]->size() > maxDims[i])
- maxDims[i] = (*ex)[i]->size();
- }
- }
-
- batch_ptr batch(new Batch());
- std::vector iterators;
- for(auto& m : maxDims) {
- batch->push_back(Shape({batchSize, m}));
- iterators.push_back(batch->inputs().back().begin());
- }
-
- for(auto& ex : batchVector) {
- for(int i = 0; i < ex->size(); ++i) {
- DataPtr d = (*ex)[i];
- d->resize(maxDims[i], 0.0f);
- iterators[i] = std::copy(d->begin(), d->end(), iterators[i]);
- }
- }
- return batch;
- }
-
- private:
- typedef unsigned char uchar;
-
- int reverseInt(int i) {
- unsigned char c1, c2, c3, c4;
- c1 = i & 255, c2 = (i >> 8) & 255, c3 = (i >> 16) & 255, c4 = (i >> 24) & 255;
- return ((int)c1 << 24) + ((int)c2 << 16) + ((int)c3 << 8) + c4;
- }
-
- std::vector ReadImages(const std::string& full_path) {
- std::ifstream file(full_path);
- UTIL_THROW_IF2(!file.is_open(),
- "Cannot open file `" + full_path + "`!");
-
- int magic_number = 0;
- file.read((char *)&magic_number, sizeof(magic_number));
- magic_number = reverseInt(magic_number);
-
- UTIL_THROW_IF2(magic_number != IMAGE_MAGIC_NUMBER,
- "Invalid MNIST image file!");
-
- int number_of_images;
- int n_rows = 0;
- int n_cols = 0;
-
- file.read((char *)&number_of_images, sizeof(number_of_images));
- number_of_images = reverseInt(number_of_images);
- file.read((char *)&n_rows, sizeof(n_rows));
- n_rows = reverseInt(n_rows);
- file.read((char *)&n_cols, sizeof(n_cols));
- n_cols = reverseInt(n_cols);
-
- int imgSize = n_rows * n_cols;
- std::vector _dataset(number_of_images);
- for(int i = 0; i < number_of_images; ++i) {
- _dataset[i].reset(new Data(imgSize, 0));
- for (int j = 0; j < imgSize; j++) {
- unsigned char pixel = 0;
- file.read((char*)&pixel, sizeof(pixel));
- (*_dataset[i])[j] = pixel / 255.0f;
- }
- }
- return _dataset;
- }
-
- std::vector ReadLabels(const std::string& full_path) {
- std::ifstream file(full_path);
-
- if (! file.is_open())
- throw std::runtime_error("Cannot open file `" + full_path + "`!");
-
- int magic_number = 0;
- file.read((char *)&magic_number, sizeof(magic_number));
- magic_number = reverseInt(magic_number);
-
- if (magic_number != LABEL_MAGIC_NUMBER)
- throw std::runtime_error("Invalid MNIST label file!");
-
- int number_of_labels;
- file.read((char *)&number_of_labels, sizeof(number_of_labels));
- number_of_labels = reverseInt(number_of_labels);
-
- std::vector _dataset(number_of_labels);
- for (int i = 0; i < number_of_labels; i++) {
- _dataset[i].reset(new Data(1, 0.0f));
- unsigned char label;
- file.read((char*)&label, 1);
- (*_dataset[i])[0] = label;
- }
-
- return _dataset;
- }
-};
-
-} // namespace mnist
-}
diff --git a/src/data/trainer.h b/src/data/trainer.h
deleted file mode 100644
index 3c9420df..00000000
--- a/src/data/trainer.h
+++ /dev/null
@@ -1,142 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-#include "common/keywords.h"
-#include "common/definitions.h"
-#include "graph/expression_graph.h"
-#include "optimizers/optimizers.h"
-#include "data/batch_generator.h"
-
-namespace marian {
-
-class RunBase {
- public:
- virtual void run() = 0;
-};
-
-typedef std::shared_ptr RunBasePtr;
-
-template
-class Trainer : public RunBase,
- public keywords::Keywords {
- private:
- ExpressionGraphPtr graph_;
- std::shared_ptr dataset_;
-
- public:
- template
- Trainer(ExpressionGraphPtr graph,
- std::shared_ptr dataset,
- Args... args)
- : Keywords(args...),
- graph_(graph),
- dataset_(dataset)
- {}
-
- void run() {
- using namespace data;
- using namespace keywords;
- boost::timer::cpu_timer trainTimer;
-
- auto opt = Get(optimizer, Optimizer());
- auto batchSize = Get(batch_size, 200);
- auto maxEpochs = Get(max_epochs, 50);
- BatchGenerator bg(dataset_, batchSize);
-
- auto validator = Get(valid, RunBasePtr());
-
- size_t update = 0;
- for(int epoch = 1; epoch <= maxEpochs; ++epoch) {
- boost::timer::cpu_timer epochTimer;
- bg.prepare();
-
- float cost = 0;
- float totalExamples = 0;
- while(bg) {
- auto batch = bg.next();
- opt->update(graph_);
- cost += graph_->get("cost")->val()->scalar() * batch->dim();
- totalExamples += batch->dim();
- update++;
- }
- cost = cost / totalExamples;
-
- std::cerr << "Epoch: " << std::setw(std::to_string(maxEpochs).size())
- << epoch << "/" << maxEpochs << " - Update: " << update
- << " - Cost: " << std::fixed << std::setprecision(4) << cost
- << " - Time: " << epochTimer.format(2, "%ws")
- << " - " << trainTimer.format(0, "%ws") << std::endl;
-
- if(validator)
- validator->run();
- }
- }
-};
-
-template
-class Validator : public RunBase,
- public keywords::Keywords {
- private:
- ExpressionGraphPtr graph_;
- std::shared_ptr dataset_;
-
- float correct(const std::vector pred, const std::vector labels) {
- size_t num = labels.size();
- size_t scores = pred.size() / num;
- size_t acc = 0;
- for (size_t i = 0; i < num; ++i) {
- size_t proposed = 0;
- for(size_t j = 0; j < scores; ++j) {
- if(pred[i * scores + j] > pred[i * scores + proposed])
- proposed = j;
- }
- acc += (proposed == labels[i]);
- }
- return (float)acc;
- }
-
- public:
- template
- Validator(ExpressionGraphPtr graph,
- std::shared_ptr dataset,
- Args... args)
- : Keywords(args...),
- graph_(graph),
- dataset_(dataset)
- {}
-
- void run() {
- using namespace data;
- using namespace keywords;
-
- auto batchSize = Get(batch_size, 200);
- BatchGenerator bg(dataset_, batchSize);
-
- size_t update = 0;
- bg.prepare(false);
-
- float total = 0;
- float cor = 0;
- while(bg) {
- auto batch = bg.next();
- graph_->forward();
- std::vector scores;
- graph_->get("scores")->val()->get(scores);
-
- cor += correct(scores, batch->inputs()[1].data());
- total += batch->dim();
- update++;
- }
- std::cerr << "Accuracy: " << cor / total << std::endl;
- }
-};
-
-template
-RunBasePtr Run(Args&& ...args) {
- return RunBasePtr(new Process(args...));
-}
-
-}
diff --git a/src/data/types.h b/src/data/types.h
index d126395d..4232a985 100644
--- a/src/data/types.h
+++ b/src/data/types.h
@@ -7,6 +7,8 @@
typedef size_t Word;
typedef std::vector Words;
-const Word EOS = 0;
-const Word UNK = 1;
+const Word EOS_ID = 0;
+const Word UNK_ID = 1;
+const std::string EOS_STR = "";
+const std::string UNK_STR = "";
diff --git a/src/data/vocab.cpp b/src/data/vocab.cpp
index 372806f9..4d62c765 100644
--- a/src/data/vocab.cpp
+++ b/src/data/vocab.cpp
@@ -1,26 +1,15 @@
#include
+#include
#include "data/vocab.h"
#include "common/utils.h"
#include "common/file_stream.h"
#include "3rd_party/exception.h"
#include "3rd_party/yaml-cpp/yaml.h"
+#include "common/logging.h"
-Vocab::Vocab(const std::string& path, int max) {
- YAML::Node vocab = YAML::Load(InputFileStream(path));
- for(auto&& pair : vocab) {
- auto str = pair.first.as();
- auto id = pair.second.as();
- if (id < (Word)max) {
- str2id_[str] = id;
- if(id >= id2str_.size())
- id2str_.resize(id + 1);
- id2str_[id] = str;
- }
- }
- UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << path);
- id2str_[0] = "";
+Vocab::Vocab() {
}
size_t Vocab::operator[](const std::string& word) const {
@@ -28,7 +17,7 @@ size_t Vocab::operator[](const std::string& word) const {
if(it != str2id_.end())
return it->second;
else
- return 1;
+ return UNK_ID;
}
Words Vocab::operator()(const std::vector& lineTokens, bool addEOS) const {
@@ -36,7 +25,7 @@ Words Vocab::operator()(const std::vector& lineTokens, bool addEOS)
std::transform(lineTokens.begin(), lineTokens.end(), words.begin(),
[&](const std::string& w) { return (*this)[w]; });
if(addEOS)
- words.push_back(EOS);
+ words.push_back(EOS_ID);
return words;
}
@@ -49,7 +38,7 @@ Words Vocab::operator()(const std::string& line, bool addEOS) const {
std::vector Vocab::operator()(const Words& sentence, bool ignoreEOS) const {
std::vector decoded;
for(size_t i = 0; i < sentence.size(); ++i) {
- if(sentence[i] != EOS || !ignoreEOS) {
+ if(sentence[i] != EOS_ID || !ignoreEOS) {
decoded.push_back((*this)[sentence[i]]);
}
}
@@ -65,3 +54,91 @@ const std::string& Vocab::operator[](size_t id) const {
size_t Vocab::size() const {
return id2str_.size();
}
+
+void Vocab::loadOrCreate(const std::string& trainPath, int max)
+{
+ if(boost::filesystem::exists(trainPath + ".json")) {
+ load(trainPath + ".json", max);
+ return;
+ }
+ if(boost::filesystem::exists(trainPath + ".yml")) {
+ load(trainPath + ".yml", max);
+ return;
+ }
+
+ create(trainPath + ".yml", max, trainPath);
+ load(trainPath + ".yml", max);
+}
+
+void Vocab::load(const std::string& vocabPath, int max)
+{
+ LOG(data) << "Loading vocabulary from " << vocabPath << " (max: " << max << ")";
+ YAML::Node vocab = YAML::Load(InputFileStream(vocabPath));
+ for(auto&& pair : vocab) {
+ auto str = pair.first.as<std::string>();
+ auto id = pair.second.as<Word>();
+ if (id < (Word)max) {
+ str2id_[str] = id;
+ if(id >= id2str_.size())
+ id2str_.resize(id + 1);
+ id2str_[id] = str;
+ }
+ }
+ UTIL_THROW_IF2(id2str_.empty(), "Empty vocabulary " << vocabPath);
+
+ id2str_[EOS_ID] = EOS_STR;
+ id2str_[UNK_ID] = UNK_STR;
+}
+
+class Vocab::VocabFreqOrderer
+{
+public:
+ bool operator()(const Vocab::Str2Id::value_type* a, const Vocab::Str2Id::value_type* b) const {
+ return a->second < b->second;
+ }
+};
+
+void Vocab::create(const std::string& vocabPath, int max, const std::string& trainPath)
+{
+ LOG(data) << "Creating vocabulary " << vocabPath
+ << " from " << trainPath << " (max: " << max << ")";
+
+ UTIL_THROW_IF2(boost::filesystem::exists(vocabPath),
+ "Vocab file " << vocabPath << " exist. Not overwriting");
+
+ InputFileStream trainStrm(trainPath);
+
+ Str2Id vocab;
+ std::string line;
+ while (getline((std::istream&)trainStrm, line)) {
+ std::vector<std::string> toks;
+ Split(line, toks);
+
+ for (const std::string &tok: toks) {
+ Str2Id::iterator iter = vocab.find(tok);
+ if (iter == vocab.end())
+ vocab[tok] = 1;
+ else
+ iter->second++;
+ }
+ }
+
+ // put into vector & sort
+ std::vector<const Str2Id::value_type*> vocabVec;
+ vocabVec.reserve(max);
+
+ for (const Str2Id::value_type &p: vocab)
+ vocabVec.push_back(&p);
+ std::sort(vocabVec.rbegin(), vocabVec.rend(), VocabFreqOrderer());
+
+ YAML::Node vocabYaml;
+ vocabYaml[EOS_STR] = EOS_ID;
+ vocabYaml[UNK_STR] = UNK_ID;
+ for(size_t i = 0; i < vocabVec.size(); ++i) {
+ const Str2Id::value_type *p = vocabVec[i];
+ vocabYaml[p->first] = i + 2;
+ }
+
+ OutputFileStream vocabStrm(vocabPath);
+ (std::ostream&)vocabStrm << vocabYaml;
+}
diff --git a/src/data/vocab.h b/src/data/vocab.h
index e7e2c416..e61ad605 100644
--- a/src/data/vocab.h
+++ b/src/data/vocab.h
@@ -8,7 +8,7 @@
class Vocab {
public:
- Vocab(const std::string& path, int max = 50000);
+ Vocab();
size_t operator[](const std::string& word) const;
@@ -22,7 +22,16 @@ class Vocab {
size_t size() const;
+ void loadOrCreate(const std::string& textPath, int max);
+ void load(const std::string& vocabPath, int max);
+ void create(const std::string& vocabPath, int max, const std::string& trainPath);
+
private:
- std::map<std::string, size_t> str2id_;
- std::vector id2str_;
+ typedef std::map<std::string, size_t> Str2Id;
+ Str2Id str2id_;
+
+ typedef std::vector<std::string> Id2Str;
+ Id2Str id2str_;
+
+ class VocabFreqOrderer;
};
diff --git a/src/graph/chainable.h b/src/graph/chainable.h
index ef358325..07bc294e 100644
--- a/src/graph/chainable.h
+++ b/src/graph/chainable.h
@@ -23,6 +23,7 @@
#include
#include
+#include
#include "exception.h"
@@ -106,6 +107,8 @@ struct Chainable {
virtual void debug(const std::string& message) = 0;
virtual bool marked_for_debug() = 0;
virtual const std::string& debug_message() = 0;
+
+ virtual size_t hash() = 0;
};
/**
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index 15885553..dc30803f 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -1,26 +1,5 @@
#pragma once
-// This file is part of the Marian toolkit.
-// Marian is copyright (c) 2016 Marcin Junczys-Dowmunt.
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
#include