Merge branch 'master' of github.com:emjotde/marian

This commit is contained in:
Marcin Junczys-Dowmunt 2016-09-22 13:13:21 +02:00
commit ade449a06a
9 changed files with 118 additions and 130 deletions

View File

@ -48,10 +48,10 @@ struct Chainable {
virtual const std::string label(const std::string& type) = 0;
virtual const Shape& shape() = 0;
- virtual DataType &val() = 0;
+ virtual const DataType &val() = 0;
virtual DataType grad() = 0;
virtual void setVal(DataType t) {
UTIL_THROW2("Tensors can only be assigned to input nodes");
UTIL_THROW2("Tensors can only be assigned to input and parameter nodes");
};
};

View File

@ -33,6 +33,10 @@ Tensor Expr::val() {
  return pimpl_->val();
}

void Expr::setVal(const Tensor &val) {
  pimpl_->setVal(val);
}

Tensor Expr::grad() {
  return pimpl_->grad();
}

View File

@ -48,6 +48,8 @@ class Expr {
Tensor val();
Tensor grad();
void setVal(const Tensor &val);
ExpressionGraphPtr graph();
ChainPtr node();

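Taken together, the Expr::setVal changes above (implementation and declaration) let a Tensor be bound to an expression after the graph is built. A minimal usage sketch, assuming the input/Tensor API shown elsewhere in this commit; batch_size, input_size, values, and the default-constructed graph are illustrative placeholders, not code from the repository:

  const int batch_size = 2, input_size = 4;            // illustrative sizes
  ExpressionGraph g;                                    // assumes a default-constructible graph
  Expr x = g.input(shape={batch_size, input_size});

  std::vector<float> values(batch_size * input_size, 1.0f);
  Tensor xt({batch_size, input_size});
  thrust::copy(values.begin(), values.end(), xt.begin());

  x.setVal(xt);   // forwards to pimpl_->setVal(xt); with this commit, allowed for input and parameter nodes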
View File

@ -12,97 +12,97 @@ void Node::calc_numeric_grad(
{
using namespace std;
  size_t inputSize = GetTotalSize(input.shape());
  size_t valSize = GetTotalSize(val_.shape());

  UTIL_THROW_IF2(inputSize != GetTotalSize(grad.shape()),
          "inputSize != gradSize:" << inputSize << "!=" << GetTotalSize(grad.shape()));
  UTIL_THROW_IF2(valSize != GetTotalSize(adj_.shape()),
          "valSize != adjSize :" << valSize << "!=" << GetTotalSize(adj_.shape()));

  cerr << "inputSize=grad=" << Debug(input.shape()) << "=" << inputSize << " "
       << "valSize=adj_=" << Debug(val_.shape()) << "=" << valSize
       << endl;

  //cerr << "input=" << input.Debug() << endl;
  //cerr << "adj_=" << adj_.Debug() << endl;

  std::vector<float> origGrad(inputSize);
  thrust::copy(grad.begin(), grad.end(), origGrad.begin());
  cerr << "origGrad=" << grad.Debug() << endl;
  //output("diffGrad", diffGrad);

  //output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());

  std::vector<float> inputVec(inputSize);
  thrust::copy(input.begin(), input.end(), inputVec.begin());
  //output("inputVec", inputVec);

  std::vector<float> newVal(inputSize, 0);

  // LOOP thru each element in input & add delta
  for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
    inputVec[inputInd] += delta;
    thrust::copy(inputVec.begin(), inputVec.end(), input.begin());
    //output("input", input.begin(), input.end());

    forward();

    for (size_t i = 0; i < valSize; ++i) {
      newVal[inputInd] += val_[i];
    }
    //output("val_", val_.begin(), val_.end());

    inputVec[inputInd] -= delta;
  }

  // orig value
  thrust::copy(inputVec.begin(), inputVec.end(), input.begin());
  forward();

  float sumValOrig = 0;
  for (size_t i = 0; i < valSize; ++i) {
    sumValOrig += val_[i];
  }
  //output("newVal", newVal.begin(), newVal.end());

  // calc gradient
  //cerr << "adj_=" << adj_.Debug() << endl;
  std::vector<float> adjVec(valSize);
  thrust::copy(adj_.begin(), adj_.end(), adjVec.begin());

  std::vector<float> numericalGrad(inputSize);
  for (size_t i = 0; i < numericalGrad.size(); ++i) {
    numericalGrad[i] = (newVal[i] - sumValOrig) / delta;
  }

  broadcast(numericalGrad, adjVec);
  //std::cerr << "broadcast size=" << numericalGrad.size() << " " << adjVec.size() << std::endl;
  //output("adjVec=", adjVec.begin(), adjVec.end());

  for (size_t i = 0; i < numericalGrad.size(); ++i) {
    numericalGrad[i] *= adjVec[i];
    numericalGrad[i] += prevCalcGrad[i];
  }
  //output("prevCalcGrad=", prevCalcGrad.begin(), prevCalcGrad.end());
  //output("adjVec=", adjVec.begin(), adjVec.end());

  // set grad results
  thrust::copy(numericalGrad.begin(), numericalGrad.end(), grad.begin());
  cerr << "numericalGrad=" << grad.Debug() << endl;
  //output("numericalGrad", numericalGrad);

  // print out diff between origGrad and numericalGrad
  std::vector<float> diff(inputSize);
  for (size_t i = 0; i < origGrad.size(); ++i) {
    diff[i] = origGrad[i] - numericalGrad[i];
  }
  cerr << "L2-norm of difference=" << L2Norm(diff) << endl << endl;

  // put back origGrad
  thrust::copy(origGrad.begin(), origGrad.end(), grad.begin());
}
float Node::L2Norm(const std::vector<float> &vec) const

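For reference, calc_numeric_grad above implements a forward-difference gradient check: each input element is nudged by delta, the node is re-run with forward(), and (sum of f(x + delta*e_i) minus sum of f(x)) / delta is compared against the analytic gradient, after weighting by adj_ and adding prevCalcGrad. Below is a self-contained toy sketch of the same idea with a made-up function f (sum of squares); it is an illustration, not code from the repository:

  #include <cstdio>
  #include <vector>

  int main() {
    const float delta = 0.001f;                 // same role as the delta passed to calc_numeric_grad
    std::vector<float> x = {0.5f, -1.0f, 2.0f};

    // f(x) = sum of squares, so the analytic gradient is 2*x[i]
    auto f = [](const std::vector<float>& v) {
      float s = 0;
      for (float e : v) s += e * e;
      return s;
    };

    const float base = f(x);                    // plays the part of sumValOrig
    for (size_t i = 0; i < x.size(); ++i) {
      std::vector<float> xp = x;
      xp[i] += delta;                           // nudge one element, like inputVec[inputInd] += delta
      float numeric = (f(xp) - base) / delta;   // forward difference, as in numericalGrad[i]
      std::printf("analytic=%f numeric=%f\n", 2 * x[i], numeric);
    }
    return 0;
  }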
View File

@ -75,7 +75,7 @@ class Node : public Chainable<Tensor>,
}
}
- virtual Tensor &val() {
+ virtual const Tensor &val() {
UTIL_THROW_IF2(!val_, "Tensor has not been allocated");
return val_;
};

View File

@ -41,7 +41,7 @@ struct InputNode : public Node {
val_ = t;
shape_ = t.shape();
//@todo, shape checking
- };
+ }
void forward() {}
void backward() {}
@ -50,7 +50,7 @@ struct InputNode : public Node {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"circle\", label=" << label("input") << ", style=\"filled\", fillcolor=\"lawngreen\"]" << std::endl << std::endl;
return ss.str();
- };
+ }
};
@ -70,7 +70,7 @@ struct ConstantNode : public Node {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"diamond\", label=" << label("const") << "]" << std::endl << std::endl;
return ss.str();
- };
+ }
};
@ -86,6 +86,12 @@ struct ParamNode : public Node {
"Param items require shape information");
}
virtual void setVal(Tensor t) {
val_ = t;
shape_ = t.shape();
//@todo, shape checking
};
void forward() {}
void backward() {}

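The new ParamNode::setVal mirrors InputNode::setVal and is what makes the (currently disabled) parameter sharing in the encoder-decoder example below possible. A minimal sketch of that use, adapted from the training loop later in this commit; g, graphs, and b0 are the names used there:

  // Re-bind the parameters of graph g to the tensors owned by graphs[b0].
  for (int i = 0; i < g.params().size(); ++i) {
    g.params()[i].setVal(graphs[b0].params()[i].val());
  }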
View File

@ -73,7 +73,7 @@ struct DotNodeOp : public BinaryNodeOp {
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("×")
ss << "\"" << this << "\" [shape=\"box\", label=" << label("")
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
@ -185,7 +185,7 @@ struct MultNodeOp : public BinaryNodeOp {
virtual std::string graphviz() {
std::stringstream ss;
ss << "\"" << this << "\" [shape=\"box\", label=" << label("")
ss << "\"" << this << "\" [shape=\"box\", label=" << label("x")
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;

View File

@ -30,7 +30,6 @@ int main(int argc, char** argv)
Expr labelExpr = g.input(shape={batch_size, output_size});
Expr inExpr2 = g.input(shape={batch_size, input_size});
Expr inExpr3 = g.input(shape={input_size, batch_size});
vector<Expr> expr;
@ -48,11 +47,15 @@ int main(int argc, char** argv)
expr.emplace_back(relu(expr.back()));
expr.emplace_back(log(expr.back()));
expr.emplace_back(exp(expr.back()));
expr.emplace_back(dropout(expr.back()));
//expr.emplace_back(softmax_slow(expr.back()));
expr.emplace_back(softmax(expr.back()));
Expr ceExpr = cross_entropy(expr.back(), labelExpr);
Expr cost = mean(ceExpr, axis=0);
std::cout << g.graphviz() << std::endl;
// create data
//srand(0);
srand(time(NULL));
@ -79,18 +82,11 @@ int main(int argc, char** argv)
inExpr2 = inTensor2;
Tensor inTensor3({input_size, batch_size});
thrust::copy(values2.begin(), values2.end(), inTensor3.begin());
inExpr3 = inTensor3;
// train
g.forward(batch_size);
//g.backward();
g.backward_debug(0.001);
std::cout << g.graphviz() << std::endl;
/*
std::cerr << "inTensor=" << inTensor.Debug() << std::endl;

View File

@ -184,73 +184,53 @@ int main(int argc, char** argv) {
std::cerr << "Building the encoder-decoder computation graph..." << std::endl;
// Build the encoder-decoder computation graph.
int num_training_examples = source_sentences.size();
int num_batches = num_training_examples / batch_size;
std::cerr << num_training_examples << " training examples." << std::endl;
int embedding_size = 50;
int hidden_size = 100;
ExpressionGraph g = build_graph(source_vocab.Size(),
target_vocab.Size(),
embedding_size,
hidden_size,
num_source_tokens-1,
num_target_tokens-1);
std::cerr << "Attaching the data to the computation graph..." << std::endl;
// Convert the data to dense one-hot vectors.
// TODO: make the graph handle sparse indices with a proper lookup layer.
for (int t = 0; t < num_source_tokens; ++t) {
Tensor Xt({batch_size, static_cast<int>(source_vocab.Size())});
std::vector<float> values(batch_size * source_vocab.Size(), 0.0);
int k = 0;
for (int i = 0; i < batch_size; ++i) {
values[k + source_sentences[i][t]] = 1.0;
k += source_vocab.Size();
}
thrust::copy(values.begin(), values.end(), Xt.begin());
// Attach this slice to the graph.
std::stringstream ss;
ss << "X" << t;
g[ss.str()] = Xt;
}
for (int t = 0; t < num_target_tokens; ++t) {
Tensor Yt({batch_size, static_cast<int>(target_vocab.Size())});
std::vector<float> values(batch_size * target_vocab.Size(), 0.0);
int k = 0;
for (int i = 0; i < batch_size; ++i) {
values[k + target_sentences[i][t]] = 1.0;
k += target_vocab.Size();
}
thrust::copy(values.begin(), values.end(), Yt.begin());
// Attach this slice to the graph.
std::stringstream ss;
ss << "Y" << t;
g[ss.str()] = Yt;
std::vector<ExpressionGraph> graphs;
for (int b = 0; b < num_batches; ++b) {
ExpressionGraph g = build_graph(source_vocab.Size(),
target_vocab.Size(),
embedding_size,
hidden_size,
num_source_tokens-1,
num_target_tokens-1);
graphs.push_back(g);
}
std::cerr << "Printing the computation graph..." << std::endl;
std::ofstream viz("encoder_decoder.dot");
- viz << g.graphviz() << std::endl;
+ viz << graphs[0].graphviz() << std::endl;
viz.close();
std::cerr << "Training..." << std::endl;
int num_training_examples = source_sentences.size();
std::cerr << num_training_examples << " training examples." << std::endl;
boost::timer::cpu_timer total;
Adam opt;
int num_epochs = 20;
int b0 = -1;
for(int epoch = 1; epoch <= num_epochs; ++epoch) {
boost::timer::cpu_timer timer;
// TODO: shuffle the batches.
// shuffle(trainImages, trainLabels, IMAGE_SIZE, LABEL_SIZE);
std::vector<size_t> indices;
int num_batches = num_training_examples / batch_size;
random_permutation(num_batches, &indices);
float cost = 0;
for(int j = 0; j < num_batches; j++) {
int b = indices[j]; // Batch index.
// Attaching the data to the computation graph...
if (b0 < 0) b0 = b;
//ExpressionGraph g = graphs[b];
ExpressionGraph g = graphs[b0];
// Share the parameters.
if (false && b != b0) {
for (int i = 0; i < g.params().size(); ++i) {
g.params()[i].setVal(graphs[b0].params()[i].val());
}
}
// Attach the data to the computation graph.
// Convert the data to dense one-hot vectors.
// TODO: make the graph handle sparse indices with a proper lookup layer.
// TODO: use different sentence lengths for the batches.