parse parallel data

2024-09-11 06:15:56 +03:00 · 2016-09-16 13:29:49 +02:00 · 2016-09-16 13:29:49 +02:00 · 1833ea1eab
commit 1833ea1eab
parent 383b82c6f9
2 changed files with 18 additions and 1 deletion
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,6 +9,7 @@ cuda_add_library(marian_lib
  tensor.cu
  tensor_operators.cu
  expression_operators.cu
+  vocab.cpp
 )

 target_link_libraries(marian_lib)
--- a/src/test.cu
+++ b/src/test.cu
@@ -1,13 +1,17 @@
-
+#include <fstream>
 #include "marian.h"
 #include "mnist.h"
+#include "vocab.h"

 int main(int argc, char** argv) {
  cudaSetDevice(0);

+  using namespace std;
  using namespace marian;
  using namespace keywords;

+  Vocab sourceVocab, targetVocab;
+
  int input_size = 10;
  int output_size = 2;
  int batch_size = 25;
@@ -30,6 +34,18 @@ int main(int argc, char** argv) {
  Expr bh = g.param(shape={1, hidden_size}, init=uniform(), name="bh");
  Expr h0 = g.param(shape={1, hidden_size}, init=uniform(), name="h0");

+  // read parallel corpus from file
+  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
+  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+
+  string sourceLine, targetLine;
+  while (getline(sourceFile, sourceLine)) {
+	  getline(targetFile, targetLine);
+
+	  std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
+	  std::vector<size_t> targetIds = sourceVocab.ProcessSentence(targetLine);
+  }
+
  std::cerr << "Building RNN..." << std::endl;
  H.emplace_back(tanh(dot(X[0], Wxh) + dot(h0, Whh) + bh));
  for (int t = 1; t < num_inputs; ++t) {