mirror of
https://github.com/marian-nmt/marian.git
synced 2024-09-17 09:47:34 +03:00
Merge branch 'master' of github.com:emjotde/marian
This commit is contained in:
commit
ade449a06a
@ -48,10 +48,10 @@ struct Chainable {
|
||||
virtual const std::string label(const std::string& type) = 0;
|
||||
|
||||
virtual const Shape& shape() = 0;
|
||||
virtual DataType &val() = 0;
|
||||
virtual const DataType &val() = 0;
|
||||
virtual DataType grad() = 0;
|
||||
virtual void setVal(DataType t) {
|
||||
UTIL_THROW2("Tensors can only be assigned to input nodes");
|
||||
UTIL_THROW2("Tensors can only be assigned to input and parameter nodes");
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -33,6 +33,10 @@ Tensor Expr::val() {
|
||||
return pimpl_->val();
|
||||
}
|
||||
|
||||
void Expr::setVal(const Tensor &val) {
|
||||
pimpl_->setVal(val);
|
||||
}
|
||||
|
||||
Tensor Expr::grad() {
|
||||
return pimpl_->grad();
|
||||
}
|
||||
|
@ -48,6 +48,8 @@ class Expr {
|
||||
Tensor val();
|
||||
Tensor grad();
|
||||
|
||||
void setVal(const Tensor &val);
|
||||
|
||||
ExpressionGraphPtr graph();
|
||||
|
||||
ChainPtr node();
|
||||
|
138
src/node.cu
138
src/node.cu
@ -12,97 +12,97 @@ void Node::calc_numeric_grad(
|
||||
{
|
||||
using namespace std;
|
||||
|
||||
size_t inputSize = GetTotalSize(input.shape());
|
||||
size_t valSize = GetTotalSize(val_.shape());
|
||||
size_t inputSize = GetTotalSize(input.shape());
|
||||
size_t valSize = GetTotalSize(val_.shape());
|
||||
|
||||
UTIL_THROW_IF2(inputSize != GetTotalSize(grad.shape()),
|
||||
"inputSize != gradSize:" << inputSize << "!=" << GetTotalSize(grad.shape()));
|
||||
UTIL_THROW_IF2(valSize != GetTotalSize(adj_.shape()),
|
||||
"valSize != adjSize :" << valSize << "!=" << GetTotalSize(adj_.shape()));
|
||||
UTIL_THROW_IF2(inputSize != GetTotalSize(grad.shape()),
|
||||
"inputSize != gradSize:" << inputSize << "!=" << GetTotalSize(grad.shape()));
|
||||
UTIL_THROW_IF2(valSize != GetTotalSize(adj_.shape()),
|
||||
"valSize != adjSize :" << valSize << "!=" << GetTotalSize(adj_.shape()));
|
||||
|
||||
cerr << "inputSize=grad=" << Debug(input.shape())<< "=" << inputSize << " "
|
||||
<< "valSize=adj_=" << Debug(val_.shape()) << "=" << valSize
|
||||
<< endl;
|
||||
cerr << "inputSize=grad=" << Debug(input.shape())<< "=" << inputSize << " "
|
||||
<< "valSize=adj_=" << Debug(val_.shape()) << "=" << valSize
|
||||
<< endl;
|
||||
|
||||
//cerr << "input=" << input.Debug() << endl;
|
||||
//cerr << "adj_=" << adj_.Debug() << endl;
|
||||
//cerr << "input=" << input.Debug() << endl;
|
||||
//cerr << "adj_=" << adj_.Debug() << endl;
|
||||
|
||||
std::vector<float> origGrad(inputSize);
|
||||
thrust::copy(grad.begin(), grad.end(), origGrad.begin());
|
||||
cerr << "origGrad=" << grad.Debug() << endl;
|
||||
//output("diffGrad", diffGrad);
|
||||
std::vector<float> origGrad(inputSize);
|
||||
thrust::copy(grad.begin(), grad.end(), origGrad.begin());
|
||||
cerr << "origGrad=" << grad.Debug() << endl;
|
||||
//output("diffGrad", diffGrad);
|
||||
|
||||
//output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());
|
||||
//output("prevCalcGrad", prevCalcGrad.begin(), prevCalcGrad.end());
|
||||
|
||||
std::vector<float> inputVec(inputSize);
|
||||
thrust::copy(input.begin(), input.end(), inputVec.begin());
|
||||
//output("inputVec", inputVec);
|
||||
std::vector<float> inputVec(inputSize);
|
||||
thrust::copy(input.begin(), input.end(), inputVec.begin());
|
||||
//output("inputVec", inputVec);
|
||||
|
||||
std::vector<float> newVal(inputSize, 0);
|
||||
std::vector<float> newVal(inputSize, 0);
|
||||
|
||||
// LOOP thru each element in input & add delta
|
||||
for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
|
||||
inputVec[inputInd] += delta;
|
||||
thrust::copy(inputVec.begin(), inputVec.end(), input.begin());
|
||||
//output("input", input.begin(), input.end());
|
||||
|
||||
forward();
|
||||
|
||||
for (size_t i = 0; i < valSize; ++i) {
|
||||
newVal[inputInd] += val_[i];
|
||||
}
|
||||
//output("val_", val_.begin(), val_.end());
|
||||
|
||||
inputVec[inputInd] -= delta;
|
||||
}
|
||||
|
||||
// orig value
|
||||
// LOOP thru each element in input & add delta
|
||||
for (size_t inputInd = 0; inputInd < inputSize; ++inputInd) {
|
||||
inputVec[inputInd] += delta;
|
||||
thrust::copy(inputVec.begin(), inputVec.end(), input.begin());
|
||||
//output("input", input.begin(), input.end());
|
||||
|
||||
forward();
|
||||
|
||||
float sumValOrig = 0;
|
||||
for (size_t i = 0; i < valSize; ++i) {
|
||||
sumValOrig += val_[i];
|
||||
newVal[inputInd] += val_[i];
|
||||
}
|
||||
//output("val_", val_.begin(), val_.end());
|
||||
|
||||
//output("newVal", newVal.begin(), newVal.end());
|
||||
inputVec[inputInd] -= delta;
|
||||
}
|
||||
|
||||
// calc gradient
|
||||
//cerr << "adj_=" << adj_.Debug() << endl;
|
||||
std::vector<float> adjVec(valSize);
|
||||
thrust::copy(adj_.begin(), adj_.end(), adjVec.begin());
|
||||
// orig value
|
||||
thrust::copy(inputVec.begin(), inputVec.end(), input.begin());
|
||||
forward();
|
||||
|
||||
std::vector<float> numericalGrad(inputSize);
|
||||
for (size_t i = 0; i < numericalGrad.size(); ++i) {
|
||||
numericalGrad[i] = (newVal[i] - sumValOrig) / delta;
|
||||
}
|
||||
float sumValOrig = 0;
|
||||
for (size_t i = 0; i < valSize; ++i) {
|
||||
sumValOrig += val_[i];
|
||||
}
|
||||
|
||||
broadcast(numericalGrad, adjVec);
|
||||
//std::cerr << "broadcast size=" << numericalGrad.size() << " " << adjVec.size() << std::endl;
|
||||
//output("adjVec=", adjVec.begin(), adjVec.end());
|
||||
//output("newVal", newVal.begin(), newVal.end());
|
||||
|
||||
for (size_t i = 0; i < numericalGrad.size(); ++i) {
|
||||
numericalGrad[i] *= adjVec[i];
|
||||
numericalGrad[i] += prevCalcGrad[i];
|
||||
}
|
||||
// calc gradient
|
||||
//cerr << "adj_=" << adj_.Debug() << endl;
|
||||
std::vector<float> adjVec(valSize);
|
||||
thrust::copy(adj_.begin(), adj_.end(), adjVec.begin());
|
||||
|
||||
//output("prevCalcGrad=", prevCalcGrad.begin(), prevCalcGrad.end());
|
||||
//output("adjVec=", adjVec.begin(), adjVec.end());
|
||||
std::vector<float> numericalGrad(inputSize);
|
||||
for (size_t i = 0; i < numericalGrad.size(); ++i) {
|
||||
numericalGrad[i] = (newVal[i] - sumValOrig) / delta;
|
||||
}
|
||||
|
||||
// set grad results
|
||||
thrust::copy(numericalGrad.begin(), numericalGrad.end(), grad.begin());
|
||||
cerr << "numericalGrad=" << grad.Debug() << endl;
|
||||
//output("numericalGrad", numericalGrad);
|
||||
broadcast(numericalGrad, adjVec);
|
||||
//std::cerr << "broadcast size=" << numericalGrad.size() << " " << adjVec.size() << std::endl;
|
||||
//output("adjVec=", adjVec.begin(), adjVec.end());
|
||||
|
||||
// print out diff between origGrad and numericalGrad
|
||||
std::vector<float> diff(inputSize);
|
||||
for (size_t i = 0; i < origGrad.size(); ++i) {
|
||||
diff[i] = origGrad[i] - numericalGrad[i];
|
||||
}
|
||||
cerr << "L2-norm of difference=" << L2Norm(diff) << endl << endl;
|
||||
for (size_t i = 0; i < numericalGrad.size(); ++i) {
|
||||
numericalGrad[i] *= adjVec[i];
|
||||
numericalGrad[i] += prevCalcGrad[i];
|
||||
}
|
||||
|
||||
// put back origGrad
|
||||
thrust::copy(origGrad.begin(), origGrad.end(), grad.begin());
|
||||
//output("prevCalcGrad=", prevCalcGrad.begin(), prevCalcGrad.end());
|
||||
//output("adjVec=", adjVec.begin(), adjVec.end());
|
||||
|
||||
// set grad results
|
||||
thrust::copy(numericalGrad.begin(), numericalGrad.end(), grad.begin());
|
||||
cerr << "numericalGrad=" << grad.Debug() << endl;
|
||||
//output("numericalGrad", numericalGrad);
|
||||
|
||||
// print out diff between origGrad and numericalGrad
|
||||
std::vector<float> diff(inputSize);
|
||||
for (size_t i = 0; i < origGrad.size(); ++i) {
|
||||
diff[i] = origGrad[i] - numericalGrad[i];
|
||||
}
|
||||
cerr << "L2-norm of difference=" << L2Norm(diff) << endl << endl;
|
||||
|
||||
// put back origGrad
|
||||
thrust::copy(origGrad.begin(), origGrad.end(), grad.begin());
|
||||
}
|
||||
|
||||
float Node::L2Norm(const std::vector<float> &vec) const
|
||||
|
@ -75,7 +75,7 @@ class Node : public Chainable<Tensor>,
|
||||
}
|
||||
}
|
||||
|
||||
virtual Tensor &val() {
|
||||
virtual const Tensor &val() {
|
||||
UTIL_THROW_IF2(!val_, "Tensor has not been allocated");
|
||||
return val_;
|
||||
};
|
||||
|
@ -41,7 +41,7 @@ struct InputNode : public Node {
|
||||
val_ = t;
|
||||
shape_ = t.shape();
|
||||
//@todo, shape checking
|
||||
};
|
||||
}
|
||||
|
||||
void forward() {}
|
||||
void backward() {}
|
||||
@ -50,7 +50,7 @@ struct InputNode : public Node {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"circle\", label=" << label("input") << ", style=\"filled\", fillcolor=\"lawngreen\"]" << std::endl << std::endl;
|
||||
return ss.str();
|
||||
};
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
@ -70,7 +70,7 @@ struct ConstantNode : public Node {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"diamond\", label=" << label("const") << "]" << std::endl << std::endl;
|
||||
return ss.str();
|
||||
};
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
@ -86,6 +86,12 @@ struct ParamNode : public Node {
|
||||
"Param items require shape information");
|
||||
}
|
||||
|
||||
virtual void setVal(Tensor t) {
|
||||
val_ = t;
|
||||
shape_ = t.shape();
|
||||
//@todo, shape checking
|
||||
};
|
||||
|
||||
void forward() {}
|
||||
void backward() {}
|
||||
|
||||
|
@ -73,7 +73,7 @@ struct DotNodeOp : public BinaryNodeOp {
|
||||
|
||||
virtual std::string graphviz() {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("×")
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("•")
|
||||
<< ", style=\"filled\", fillcolor=\"orange\"]" << std::endl;
|
||||
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
|
||||
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
|
||||
@ -185,7 +185,7 @@ struct MultNodeOp : public BinaryNodeOp {
|
||||
|
||||
virtual std::string graphviz() {
|
||||
std::stringstream ss;
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("•")
|
||||
ss << "\"" << this << "\" [shape=\"box\", label=" << label("x")
|
||||
<< ", style=\"filled\", fillcolor=\"yellow\"]" << std::endl;
|
||||
ss << "\"" << a_ << "\" -> \"" << this << "\"" << std::endl;
|
||||
ss << "\"" << b_ << "\" -> \"" << this << "\"" << std::endl << std::endl;
|
||||
|
@ -30,7 +30,6 @@ int main(int argc, char** argv)
|
||||
Expr labelExpr = g.input(shape={batch_size, output_size});
|
||||
|
||||
Expr inExpr2 = g.input(shape={batch_size, input_size});
|
||||
Expr inExpr3 = g.input(shape={input_size, batch_size});
|
||||
|
||||
vector<Expr> expr;
|
||||
|
||||
@ -48,11 +47,15 @@ int main(int argc, char** argv)
|
||||
expr.emplace_back(relu(expr.back()));
|
||||
expr.emplace_back(log(expr.back()));
|
||||
expr.emplace_back(exp(expr.back()));
|
||||
expr.emplace_back(dropout(expr.back()));
|
||||
//expr.emplace_back(softmax_slow(expr.back()));
|
||||
expr.emplace_back(softmax(expr.back()));
|
||||
|
||||
Expr ceExpr = cross_entropy(expr.back(), labelExpr);
|
||||
Expr cost = mean(ceExpr, axis=0);
|
||||
|
||||
std::cout << g.graphviz() << std::endl;
|
||||
|
||||
// create data
|
||||
//srand(0);
|
||||
srand(time(NULL));
|
||||
@ -79,18 +82,11 @@ int main(int argc, char** argv)
|
||||
|
||||
inExpr2 = inTensor2;
|
||||
|
||||
Tensor inTensor3({input_size, batch_size});
|
||||
thrust::copy(values2.begin(), values2.end(), inTensor3.begin());
|
||||
|
||||
inExpr3 = inTensor3;
|
||||
|
||||
// train
|
||||
g.forward(batch_size);
|
||||
//g.backward();
|
||||
g.backward_debug(0.001);
|
||||
|
||||
std::cout << g.graphviz() << std::endl;
|
||||
|
||||
/*
|
||||
std::cerr << "inTensor=" << inTensor.Debug() << std::endl;
|
||||
|
||||
|
@ -184,73 +184,53 @@ int main(int argc, char** argv) {
|
||||
std::cerr << "Building the encoder-decoder computation graph..." << std::endl;
|
||||
|
||||
// Build the encoder-decoder computation graph.
|
||||
int num_training_examples = source_sentences.size();
|
||||
int num_batches = num_training_examples / batch_size;
|
||||
std::cerr << num_training_examples << " training examples." << std::endl;
|
||||
int embedding_size = 50;
|
||||
int hidden_size = 100;
|
||||
ExpressionGraph g = build_graph(source_vocab.Size(),
|
||||
target_vocab.Size(),
|
||||
embedding_size,
|
||||
hidden_size,
|
||||
num_source_tokens-1,
|
||||
num_target_tokens-1);
|
||||
|
||||
std::cerr << "Attaching the data to the computation graph..." << std::endl;
|
||||
|
||||
// Convert the data to dense one-hot vectors.
|
||||
// TODO: make the graph handle sparse indices with a proper lookup layer.
|
||||
for (int t = 0; t < num_source_tokens; ++t) {
|
||||
Tensor Xt({batch_size, static_cast<int>(source_vocab.Size())});
|
||||
std::vector<float> values(batch_size * source_vocab.Size(), 0.0);
|
||||
int k = 0;
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
values[k + source_sentences[i][t]] = 1.0;
|
||||
k += source_vocab.Size();
|
||||
}
|
||||
thrust::copy(values.begin(), values.end(), Xt.begin());
|
||||
// Attach this slice to the graph.
|
||||
std::stringstream ss;
|
||||
ss << "X" << t;
|
||||
g[ss.str()] = Xt;
|
||||
}
|
||||
|
||||
for (int t = 0; t < num_target_tokens; ++t) {
|
||||
Tensor Yt({batch_size, static_cast<int>(target_vocab.Size())});
|
||||
std::vector<float> values(batch_size * target_vocab.Size(), 0.0);
|
||||
int k = 0;
|
||||
for (int i = 0; i < batch_size; ++i) {
|
||||
values[k + target_sentences[i][t]] = 1.0;
|
||||
k += target_vocab.Size();
|
||||
}
|
||||
thrust::copy(values.begin(), values.end(), Yt.begin());
|
||||
// Attach this slice to the graph.
|
||||
std::stringstream ss;
|
||||
ss << "Y" << t;
|
||||
g[ss.str()] = Yt;
|
||||
std::vector<ExpressionGraph> graphs;
|
||||
for (int b = 0; b < num_batches; ++b) {
|
||||
ExpressionGraph g = build_graph(source_vocab.Size(),
|
||||
target_vocab.Size(),
|
||||
embedding_size,
|
||||
hidden_size,
|
||||
num_source_tokens-1,
|
||||
num_target_tokens-1);
|
||||
graphs.push_back(g);
|
||||
}
|
||||
|
||||
std::cerr << "Printing the computation graph..." << std::endl;
|
||||
std::ofstream viz("encoder_decoder.dot");
|
||||
viz << g.graphviz() << std::endl;
|
||||
viz << graphs[0].graphviz() << std::endl;
|
||||
viz.close();
|
||||
|
||||
std::cerr << "Training..." << std::endl;
|
||||
|
||||
int num_training_examples = source_sentences.size();
|
||||
std::cerr << num_training_examples << " training examples." << std::endl;
|
||||
|
||||
boost::timer::cpu_timer total;
|
||||
Adam opt;
|
||||
int num_epochs = 20;
|
||||
int b0 = -1;
|
||||
for(int epoch = 1; epoch <= num_epochs; ++epoch) {
|
||||
boost::timer::cpu_timer timer;
|
||||
// TODO: shuffle the batches.
|
||||
// shuffle(trainImages, trainLabels, IMAGE_SIZE, LABEL_SIZE);
|
||||
std::vector<size_t> indices;
|
||||
int num_batches = num_training_examples / batch_size;
|
||||
random_permutation(num_batches, &indices);
|
||||
float cost = 0;
|
||||
for(int j = 0; j < num_batches; j++) {
|
||||
int b = indices[j]; // Batch index.
|
||||
// Attaching the data to the computation graph...
|
||||
if (b0 < 0) b0 = b;
|
||||
//ExpressionGraph g = graphs[b];
|
||||
ExpressionGraph g = graphs[b0];
|
||||
// Share the parameters.
|
||||
if (false && b != b0) {
|
||||
for (int i = 0; i < g.params().size(); ++i) {
|
||||
g.params()[i].setVal(graphs[b0].params()[i].val());
|
||||
}
|
||||
}
|
||||
|
||||
// Attach the data to the computation graph.
|
||||
// Convert the data to dense one-hot vectors.
|
||||
// TODO: make the graph handle sparse indices with a proper lookup layer.
|
||||
// TODO: use different sentence lengths for the batches.
|
||||
|
Loading…
Reference in New Issue
Block a user