From 1833ea1eab5859c02218510d38d3384a10ce3e42 Mon Sep 17 00:00:00 2001
From: Hieu Hoang
Date: Fri, 16 Sep 2016 13:29:49 +0200
Subject: [PATCH] parse parallel data

---
Review notes (this section is ignored by git am):
 - Line breaks, indentation and blank context lines were reconstructed from a
   whitespace-collapsed copy of this patch; apply with
   `git apply --ignore-whitespace` if the context does not match exactly.
 - The header name of the new #include was lost in the collapsed copy; it is
   restored as <fstream> because the added code uses std::fstream.
 - The element type of the std::vector results was also lost; `auto` is used
   so the declarations follow whatever Vocab::ProcessSentence returns.
 - Fix: the target sentence is now processed with targetVocab (it was
   processed with sourceVocab, leaving targetVocab unused).
 - Fix: the loop now stops when either file runs out of lines instead of
   re-using a stale targetLine.

 src/CMakeLists.txt |  1 +
 src/test.cu        | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index c772e360..e58473cf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,6 +9,7 @@ cuda_add_library(marian_lib
   tensor.cu
   tensor_operators.cu
   expression_operators.cu
+  vocab.cpp
 )
 
 target_link_libraries(marian_lib)
diff --git a/src/test.cu b/src/test.cu
index d27591be..7da85c9d 100644
--- a/src/test.cu
+++ b/src/test.cu
@@ -1,13 +1,17 @@
-
+#include <fstream>
 #include "marian.h"
 #include "mnist.h"
+#include "vocab.h"
 
 int main(int argc, char** argv) {
   cudaSetDevice(0);
 
+  using namespace std;
   using namespace marian;
   using namespace keywords;
 
+  Vocab sourceVocab, targetVocab;
+
   int input_size = 10;
   int output_size = 2;
   int batch_size = 25;
@@ -30,6 +34,18 @@ int main(int argc, char** argv) {
   Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
   Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");
 
+  // read parallel corpus from file
+  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
+  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+
+  string sourceLine, targetLine;
+  while (getline(sourceFile, sourceLine) &&
+         getline(targetFile, targetLine)) {
+    // stop at the shorter file so source/target sentences stay paired
+    auto sourceIds = sourceVocab.ProcessSentence(sourceLine);
+    auto targetIds = targetVocab.ProcessSentence(targetLine);
+  }
+
   std::cerr << "Building RNN..." << std::endl;
   H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh));
   for (int t = 1; t < num_inputs; ++t) {