Mirror of https://github.com/marian-nmt/marian.git
Made the encoder-decoder (e-d) example use actual MT data instead of synthetic data.
parent 1b27accaa0
commit c54eaf17d5
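In short: instead of hard-coding synthetic dimensions, the example now reads up to batch_size real sentence pairs from the newstest2013 dev files, appends an EOS token to each sentence, pads every sentence to the longest source/target length in the batch, and attaches each time step to the graph as a dense one-hot matrix. A minimal standalone sketch of that packing scheme (plain C++ with illustrative names; one_hot_slice is not part of the marian API):

// One time step t of a padded batch, flattened into a row-major
// {batch_size, vocab_size} one-hot matrix: the same scheme as the
// "Convert the data to dense one-hot vectors" loop in the diff below.
#include <cstddef>
#include <vector>

std::vector<float> one_hot_slice(
    const std::vector<std::vector<std::size_t>>& sentences,  // padded batch
    std::size_t t,                                           // time step
    std::size_t vocab_size) {
  std::vector<float> values(sentences.size() * vocab_size, 0.0f);
  std::size_t k = 0;                  // offset of the current row in the flat buffer
  for (const auto& sentence : sentences) {
    values[k + sentence[t]] = 1.0f;   // one 1.0 per row, at the token id
    k += vocab_size;                  // advance one row
  }
  return values;  // ready to copy into the {batch, vocab} Tensor X_t
}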
@@ -7,17 +7,20 @@
 using namespace marian;
 using namespace keywords;
 
-const int input_size = 10;
-const int output_size = 15;
-const int embedding_size = 8;
-const int hidden_size = 5;
-const int batch_size = 25;
-const int num_inputs = 8;
-const int num_outputs = 6;
-
-ExpressionGraph build_graph(int cuda_device) {
+ExpressionGraph build_graph(int cuda_device,
+                            int source_vocabulary_size,
+                            int target_vocabulary_size,
+                            int embedding_size,
+                            int hidden_size,
+                            int num_source_tokens,
+                            int num_target_tokens) {
   std::cerr << "Building computation graph..." << std::endl;
 
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
+
   ExpressionGraph g(cuda_device);
   std::vector<Expr> X, Y, H, S;
 
@@ -25,14 +28,14 @@ ExpressionGraph build_graph(int cuda_device) {
   for (int t = 0; t <= num_inputs; ++t) {
     std::stringstream ss;
     ss << "X" << t;
-    X.emplace_back(named(g.input(shape={batch_size, input_size}), ss.str()));
+    X.emplace_back(named(g.input(shape={whatevs, input_size}), ss.str()));
   }
 
   // We're including the stop symbol here.
   for (int t = 0; t <= num_outputs; ++t) {
     std::stringstream ss;
     ss << "Y" << t;
-    Y.emplace_back(named(g.input(shape={batch_size, output_size}), ss.str()));
+    Y.emplace_back(named(g.input(shape={whatevs, output_size}), ss.str()));
   }
 
   // Source embeddings.
@@ -96,30 +99,125 @@ ExpressionGraph build_graph(int cuda_device) {
 int main(int argc, char** argv) {
 #if 1
   std::cerr << "Loading the data... ";
-  Vocab sourceVocab, targetVocab;
+  Vocab source_vocab, target_vocab;
 
   // read parallel corpus from file
-  std::fstream sourceFile("../examples/mt/dev/newstest2013.de");
-  std::fstream targetFile("../examples/mt/dev/newstest2013.en");
+  std::fstream source_file("../examples/mt/dev/newstest2013.de");
+  std::fstream target_file("../examples/mt/dev/newstest2013.en");
 
+  // Right now we're only reading the first few sentence pairs, and defining
+  // that as the step size.
+  int batch_size = 64;
+  int num_source_tokens = -1;
+  int num_target_tokens = -1;
   std::vector<std::vector<size_t> > source_sentences, target_sentences;
-  std::string sourceLine, targetLine;
-  while (getline(sourceFile, sourceLine)) {
-    getline(targetFile, targetLine);
-    std::vector<size_t> sourceIds = sourceVocab.ProcessSentence(sourceLine);
-    std::vector<size_t> targetIds = targetVocab.ProcessSentence(targetLine);
-    source_sentences.push_back(sourceIds);
-    target_sentences.push_back(targetIds);
+  std::string source_line, target_line;
+  while (getline(source_file, source_line)) {
+    getline(target_file, target_line);
+    std::vector<size_t> source_ids = source_vocab.ProcessSentence(source_line);
+    source_ids.push_back(source_vocab.GetEOS()); // Append EOS token.
+    std::vector<size_t> target_ids = target_vocab.ProcessSentence(target_line);
+    target_ids.push_back(target_vocab.GetEOS()); // Append EOS token.
+    source_sentences.push_back(source_ids);
+    target_sentences.push_back(target_ids);
+    if (num_source_tokens < 0 || source_ids.size() > num_source_tokens) {
+      num_source_tokens = source_ids.size();
+    }
+    if (num_target_tokens < 0 || target_ids.size() > num_target_tokens) {
+      num_target_tokens = target_ids.size();
+    }
+    if (source_sentences.size() == batch_size) break;
   }
   std::cerr << "Done." << std::endl;
   std::cerr << source_sentences.size()
             << " sentence pairs read." << std::endl;
-  std::cerr << "Source vocabulary size: " << sourceVocab.Size() << std::endl;
-  std::cerr << "Target vocabulary size: " << targetVocab.Size() << std::endl;
-#endif
+  std::cerr << "Source vocabulary size: " << source_vocab.Size() << std::endl;
+  std::cerr << "Target vocabulary size: " << target_vocab.Size() << std::endl;
+  std::cerr << "Max source tokens: " << num_source_tokens << std::endl;
+  std::cerr << "Max target tokens: " << num_target_tokens << std::endl;
 
+  // Padding the source and target sentences.
+  for (auto &sentence : source_sentences) {
+    for (int i = sentence.size(); i < num_source_tokens; ++i) {
+      sentence.push_back(source_vocab.GetPAD());
+    }
+  }
+  for (auto &sentence : target_sentences) {
+    for (int i = sentence.size(); i < num_target_tokens; ++i) {
+      sentence.push_back(target_vocab.GetPAD());
+    }
+  }
+
+  std::cerr << "Building the encoder-decoder computation graph..." << std::endl;
 
   // Build the encoder-decoder computation graph.
-  ExpressionGraph g = build_graph(0);
+  int embedding_size = 50;
+  int hidden_size = 100;
+  ExpressionGraph g = build_graph(0, // cuda device.
+                                  source_vocab.Size(),
+                                  target_vocab.Size(),
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens-1,
+                                  num_target_tokens-1);
+
+  std::cerr << "Attaching the data to the computation graph..." << std::endl;
+
+  // Convert the data to dense one-hot vectors.
+  // TODO: make the graph handle sparse indices with a proper lookup layer.
+  for (int t = 0; t < num_source_tokens; ++t) {
+    Tensor Xt({batch_size, static_cast<int>(source_vocab.Size())});
+    std::vector<float> values(batch_size * source_vocab.Size(), 0.0);
+    int k = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      values[k + source_sentences[i][t]] = 1.0;
+      k += source_vocab.Size();
+    }
+    thrust::copy(values.begin(), values.end(), Xt.begin());
+    // Attach this slice to the graph.
+    std::stringstream ss;
+    ss << "X" << t;
+    g[ss.str()] = Xt;
+  }
+
+  for (int t = 0; t < num_target_tokens; ++t) {
+    Tensor Yt({batch_size, static_cast<int>(target_vocab.Size())});
+    std::vector<float> values(batch_size * target_vocab.Size(), 0.0);
+    int k = 0;
+    for (int i = 0; i < batch_size; ++i) {
+      values[k + target_sentences[i][t]] = 1.0;
+      k += target_vocab.Size();
+    }
+    thrust::copy(values.begin(), values.end(), Yt.begin());
+    // Attach this slice to the graph.
+    std::stringstream ss;
+    ss << "Y" << t;
+    g[ss.str()] = Yt;
+  }
+
+#else
+
+  int source_vocabulary_size = 10;
+  int target_vocabulary_size = 15;
+  int embedding_size = 8;
+  int hidden_size = 5;
+  int batch_size = 25;
+  int num_source_tokens = 8;
+  int num_target_tokens = 6;
+
+  // Build the encoder-decoder computation graph.
+  ExpressionGraph g = build_graph(0, // cuda device.
+                                  source_vocabulary_size,
+                                  target_vocabulary_size,
+                                  embedding_size,
+                                  hidden_size,
+                                  num_source_tokens,
+                                  num_target_tokens);
+
+  int input_size = source_vocabulary_size;
+  int output_size = target_vocabulary_size;
+  int num_inputs = num_source_tokens;
+  int num_outputs = num_target_tokens;
+
   // Generate input data (include the stop symbol).
   for (int t = 0; t <= num_inputs; ++t) {
@@ -155,6 +253,8 @@ int main(int argc, char** argv) {
     ss << "Y" << t;
     g[ss.str()] = Yt;
   }
 
+#endif
+
   std::cerr << "Printing the computation graph..." << std::endl;
   std::cout << g.graphviz() << std::endl;
@@ -167,6 +267,7 @@ int main(int argc, char** argv) {
 
   std::cerr << g["cost"].val().Debug() << std::endl;
 
+#if 0
   std::cerr << g["X0"].val().Debug() << std::endl;
   std::cerr << g["Y0"].val().Debug() << std::endl;
   std::cerr << g["Whh"].grad().Debug() << std::endl;
@@ -175,6 +276,7 @@ int main(int argc, char** argv) {
   std::cerr << g["by"].grad().Debug() << std::endl;
   std::cerr << g["Wxh"].grad().Debug() << std::endl;
   std::cerr << g["h0"].grad().Debug() << std::endl;
+#endif
 
   return 0;
 }
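Two details worth noting. build_graph creates num_inputs + 1 input slices (its loop runs from t = 0 through t <= num_inputs), so main passes num_source_tokens-1 and num_target_tokens-1 to make the number of graph inputs match the number of attached data slices exactly. And the TODO about sparse indices points at the standard optimization: multiplying a one-hot row by an embedding matrix E just selects row E[id], so a lookup layer could replace the dense one-hot product. A hedged sketch of that equivalence (illustrative C++ only, not the marian API):

// one_hot(id) * E (with E a vocab x dim matrix) equals row E[id];
// a lookup layer exploits this and never materializes the zeros.
#include <cstddef>
#include <vector>

using Matrix = std::vector<std::vector<float>>;  // row-major: vocab x dim

std::vector<float> embed_dense(const Matrix& E, std::size_t id) {
  std::vector<float> one_hot(E.size(), 0.0f);
  one_hot[id] = 1.0f;
  std::vector<float> out(E[0].size(), 0.0f);
  for (std::size_t v = 0; v < E.size(); ++v)      // out = one_hot * E
    for (std::size_t d = 0; d < E[v].size(); ++d)
      out[d] += one_hot[v] * E[v][d];
  return out;
}

std::vector<float> embed_lookup(const Matrix& E, std::size_t id) {
  return E[id];  // same result, O(dim) instead of O(vocab * dim)
}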