preparing mnist benchmark

Marcin Junczys-Dowmunt 2016-09-17 19:52:56 +02:00
parent 1d11b4a40e
commit bfd9c369f8
8 changed files with 198 additions and 150 deletions

View File

@@ -3,6 +3,7 @@
 import sys
 import os
 import numpy as np
+import time
 from keras.datasets import mnist
 from keras.utils import np_utils
 from keras.models import Sequential
@@ -15,7 +16,11 @@ def softmax(x):
 def baseline_model(pixels_count, classes_count):
     model = Sequential()
-    model.add(Dense(100, input_dim=pixels_count, init='normal', activation='tanh'))
+    model.add(Dense(2000, input_dim=pixels_count, init='normal', activation='tanh'))
+    model.add(Dense(2000, init='normal', activation='tanh'))
+    model.add(Dense(2000, init='normal', activation='tanh'))
+    model.add(Dense(2000, init='normal', activation='tanh'))
+    model.add(Dense(2000, init='normal', activation='tanh'))
     model.add(Dense(classes_count, input_dim=100, init='normal', activation='softmax'))
     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
     return model
@@ -52,21 +57,24 @@ if __name__ == "__main__":
     # Build the model
     model = baseline_model(pixels_count, classes_count)
     # Fit the model
-    model.fit(X_train, y_train, validation_data=(X_test, y_test), nb_epoch=10, batch_size=200, verbose=2)
+    start = time.time();
+    model.fit(X_train, y_train, nb_epoch=10, batch_size=200, verbose=2)
+    print "Time elapsed", time.time() - start, "s"
     # Final evaluation of the model
     scores = model.evaluate(X_test, y_test, verbose=0)
-    print("Baseline Error: %.2f%%" % (100-scores[1]*100))
+    print("Accuracy: %.2f%%" % (scores[1] * 100))

     ### Weight and bias matrixes - we extract them from the model
     # weights_ones = np.ones((pixels_count, classes_count))
     # print weights_ones.shape
-    weights1, bias1, weights2, bias2 = model.get_weights()
+    #weights1, bias1, weights2, bias2 = model.get_weights()

     ### Save model to npz files
-    if not os.path.exists("test_model_multi"):
-        os.makedirs("test_model_multi")
+    #if not os.path.exists("test_model_multi"):
+    #    os.makedirs("test_model_multi")
     # np.savez("test_model_multi/model", *model)
-    np.savez("test_model_multi/model", weights1 = weights1, bias1 = bias1, weights2 = weights2, bias2 = bias2)
-    print "Model saved! Check test_model_multi directory"
+    #np.savez("test_model_multi/model", weights1 = weights1, bias1 = bias1, weights2 = weights2, bias2 = bias2)
+    #print "Model saved! Check test_model_multi directory"

View File

@@ -21,30 +21,25 @@ cuda_add_executable(
 target_link_libraries(marian marian_lib)

 cuda_add_executable(
-  train_mnist
-  train_mnist.cu
+  mnist_benchmark
+  mnist_benchmark.cu
 )
-target_link_libraries(train_mnist marian_lib)
-cuda_add_executable(
-  validate_mnist
-  validate_mnist.cu
-)

 cuda_add_executable(
   validate_mnist_batch
   validate_mnist_batch.cu
 )

 cuda_add_executable(
   validate_encoder_decoder
   validate_encoder_decoder.cu
 )

-target_link_libraries(validate_mnist marian_lib)
+target_link_libraries(mnist_benchmark marian_lib)
 target_link_libraries(validate_mnist_batch marian_lib)
 target_link_libraries(validate_encoder_decoder marian_lib)

-foreach(exec marian train_mnist validate_mnist validate_mnist_batch validate_encoder_decoder)
+foreach(exec marian mnist_benchmark validate_mnist_batch validate_encoder_decoder)
   target_link_libraries(${exec} ${EXT_LIBS} cuda cudnn)
   cuda_add_cublas_to_target(${exec})
   set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

src/mnist_benchmark.cu (new file, 164 lines)
View File

@@ -0,0 +1,164 @@
#include <algorithm>
#include <chrono>
#include <boost/timer/timer.hpp>

#include "marian.h"
#include "mnist.h"
#include "npz_converter.h"
#include "optimizers.h"

using namespace marian;
using namespace keywords;

const size_t IMAGE_SIZE = 784;
const size_t LABEL_SIZE = 10;
int BATCH_SIZE = 200;

ExpressionGraph build_graph(const std::vector<int>& dims) {
  std::cerr << "Building model... ";
  boost::timer::cpu_timer timer;

  ExpressionGraph g;
  auto x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
  auto y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");

  std::vector<Expr> layers, weights, biases;
  for(int i = 0; i < dims.size()-1; ++i) {
    int in = dims[i];
    int out = dims[i+1];

    if(i == 0) {
      layers.emplace_back(x);
    }
    else {
      layers.emplace_back(tanh(dot(layers.back(), weights.back())) + biases.back());
    }

    weights.emplace_back(
      g.param(shape={in, out}, init=normal()));
    biases.emplace_back(
      g.param(shape={1, out}, init=normal()));
  }

  auto probs = named(
    softmax(dot(layers.back(), weights.back()) + biases.back()),
    "probs"
  );

  auto cost = -mean(sum(y * log(probs), axis=1), axis=0);
  auto costreg = named(
    cost, "cost"
  );

  std::cerr << timer.format(5, "%ws") << std::endl;
  return g;
}
void shuffle(std::vector<float>& x, std::vector<float>& y, size_t dimx, size_t dimy) {
  std::srand(std::time(0));

  std::vector<size_t> ind;
  for(size_t i = 0; i < x.size() / dimx; ++i) {
    ind.push_back(i);
  }
  std::random_shuffle(ind.begin(), ind.end());

  std::vector<float> xShuffled(x.size());
  std::vector<float> yShuffled(y.size());

  int j = 0;
  for(auto i : ind) {
    std::copy(x.begin() + j * dimx, x.begin() + j * dimx + dimx, xShuffled.begin() + i * dimx);
    std::copy(y.begin() + j * dimy, y.begin() + j * dimy + dimy, yShuffled.begin() + i * dimy);
    j++;
  }

  x = xShuffled;
  y = yShuffled;
}
int main(int argc, char** argv) {
  int trainRows, testRows;

  std::cerr << "Loading train set...";
  std::vector<float> trainImages = datasets::mnist::ReadImages("../examples/mnist/train-images-idx3-ubyte", trainRows, IMAGE_SIZE);
  std::vector<float> trainLabels = datasets::mnist::ReadLabels("../examples/mnist/train-labels-idx1-ubyte", trainRows, LABEL_SIZE);
  std::cerr << "Done." << std::endl;

  std::cerr << "Loading test set...";
  std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", testRows, IMAGE_SIZE);
  std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", testRows, LABEL_SIZE);
  std::cerr << "Done." << std::endl;

  ExpressionGraph g = build_graph({IMAGE_SIZE, 2000, 2000, 2000, 2000, 2000, LABEL_SIZE});

  Tensor xt({BATCH_SIZE, IMAGE_SIZE});
  Tensor yt({BATCH_SIZE, LABEL_SIZE});

  boost::timer::cpu_timer total;
  Adam opt;
  for(int i = 1; i <= 10; ++i) {
    boost::timer::cpu_timer timer;
    shuffle(trainImages, trainLabels, IMAGE_SIZE, LABEL_SIZE);
    float cost = 0;
    for(int j = 0; j < trainRows / BATCH_SIZE; j++) {
      size_t xBatch = IMAGE_SIZE * BATCH_SIZE;
      auto xbegin = trainImages.begin() + j * xBatch;
      auto xend = xbegin + xBatch;
      xt.set(xbegin, xend);

      size_t yBatch = LABEL_SIZE * BATCH_SIZE;
      auto ybegin = trainLabels.begin() + j * yBatch;
      auto yend = ybegin + yBatch;
      yt.set(ybegin, yend);

      g["x"] = xt;
      g["y"] = yt;

      opt(g, BATCH_SIZE);
      cost += g["cost"].val()[0];
    }
    std::cerr << "Epoch: " << i << " - Cost: " << cost / trainRows * BATCH_SIZE << " - " << timer.format(3, "%ws") << std::endl;
  }
  std::cerr << "Total: " << total.format(3, "%ws") << std::endl;

  std::vector<float> results;
  for(int j = 0; j < testRows / BATCH_SIZE; j++) {
    size_t xBatch = IMAGE_SIZE * BATCH_SIZE;
    auto xbegin = testImages.begin() + j * xBatch;
    auto xend = xbegin + xBatch;
    xt.set(xbegin, xend);
    yt.set(0);

    g["x"] = xt;
    g["y"] = yt;

    g.forward(BATCH_SIZE);

    std::vector<float> bResults;
    bResults << g["probs"].val();
    results.insert(results.end(), bResults.begin(), bResults.end());
  }

  size_t acc = 0;
  for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
    size_t correct = 0;
    size_t proposed = 0;
    for (size_t j = 0; j < LABEL_SIZE; ++j) {
      if (testLabels[i + j])
        correct = j;
      if (results[i + j] > results[i + proposed])
        proposed = j;
    }
    acc += (correct == proposed);
  }
  std::cerr << "Accuracy: " << float(acc) / testRows << std::endl;

  return 0;
}
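
One caveat about the shuffle() helper above: std::random_shuffle is deprecated in C++14 and removed in C++17, and std::srand/std::time gives weak seeding. A minimal modernized sketch of the same paired shuffle (illustrative names, not code from this commit):

#include <algorithm>
#include <numeric>
#include <random>
#include <vector>

// Shuffle x and y in parallel: both are row-major matrices with row
// widths dimx and dimy, and row i of x must stay aligned with row i of y.
void shuffle_paired(std::vector<float>& x, std::vector<float>& y,
                    size_t dimx, size_t dimy) {
  std::vector<size_t> ind(x.size() / dimx);
  std::iota(ind.begin(), ind.end(), 0);        // 0, 1, ..., rows-1

  std::mt19937 rng(std::random_device{}());    // properly seeded engine
  std::shuffle(ind.begin(), ind.end(), rng);   // permute row indices once

  std::vector<float> xs(x.size()), ys(y.size());
  for(size_t j = 0; j < ind.size(); ++j) {
    // source row j moves to destination row ind[j] in both arrays
    std::copy(x.begin() + j * dimx, x.begin() + (j + 1) * dimx, xs.begin() + ind[j] * dimx);
    std::copy(y.begin() + j * dimy, y.begin() + (j + 1) * dimy, ys.begin() + ind[j] * dimy);
  }
  x = std::move(xs);
  y = std::move(ys);
}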

View File

@@ -31,7 +31,7 @@ class Sgd {
 // @TODO: Add serialization for historic gradients and parameters
 class Adagrad {
   public:
-    Adagrad(float eta=0.01, float eps=10e-8)
+    Adagrad(float eta=0.01, float eps=1e-8)
      : eta_(eta), eps_(eps) {}

     void operator()(ExpressionGraph& graph, int batchSize) {
@@ -61,7 +61,7 @@ class Adagrad {
 // https://arxiv.org/pdf/1412.6980v8.pdf
 class Adam {
   public:
-    Adam(float eta=0.001, float beta1=0.999, float beta2=0.999, float eps=10e-8)
+    Adam(float eta=0.001, float beta1=0.9, float beta2=0.999, float eps=1e-8)
     : eta_(eta), beta1_(beta1), beta2_(beta2), eps_(eps), t_(0) {}

     void operator()(ExpressionGraph& graph, int batchSize) {
@@ -101,4 +101,4 @@ class Adam {
     std::vector<Tensor> vt_;
 };
 }
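
The new defaults match the paper cited in the comment above (Kingma & Ba, arXiv:1412.6980): beta1 = 0.9, beta2 = 0.999, eps = 1e-8. Note that the old eps=10e-8 actually parses as 1e-7. For reference, the per-step update the paper prescribes, with g_t the gradient at step t:

\begin{align*}
m_t &= \beta_1 m_{t-1} + (1 - \beta_1)\, g_t \\
v_t &= \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2 \\
\hat{m}_t &= m_t / (1 - \beta_1^t), \qquad \hat{v}_t = v_t / (1 - \beta_2^t) \\
\theta_t &= \theta_{t-1} - \eta\, \hat{m}_t / \big(\sqrt{\hat{v}_t} + \epsilon\big)
\end{align*}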

View File

@@ -33,13 +33,13 @@ void distribution(Tensor t, float a, float b) {
   t << vals;
 }

-std::function<void(Tensor)> normal(float mean = 0.0, float std = 0.1) {
+std::function<void(Tensor)> normal(float mean = 0.0, float std = 0.05) {
   return [mean, std](Tensor t) {
     distribution<std::normal_distribution<float>>(t, mean, std);
   };
 }

-std::function<void(Tensor)> uniform(float a = 0.0, float b = 0.1) {
+std::function<void(Tensor)> uniform(float a = 0.0, float b = 0.05) {
   return [a, b](Tensor t) {
     distribution<std::uniform_real_distribution<float>>(t, a, b);
   };
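
One plausible reading of the halved default scale (an inference, not stated in the commit): for a layer with fan-in n, unit-variance inputs, and i.i.d. N(0, sigma^2) weights, the pre-activation variance is

\mathrm{Var}\Big(\sum_{i=1}^{n} w_i x_i\Big) = n\,\sigma^2, \qquad n = 2000:\quad \sigma = 0.1 \Rightarrow 20, \qquad \sigma = 0.05 \Rightarrow 5,

so the smaller sigma keeps the 2000-wide tanh layers of the benchmark further from saturation at initialization.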

View File

@@ -4,6 +4,14 @@ using namespace std;
 namespace marian {

+// @TODO: handle this better, maybe per thread?
+static cublasHandle_t create_handle() {
+  cublasHandle_t cublasHandle;
+  cublasCreate(&cublasHandle);
+  return cublasHandle;
+}
+cublasHandle_t cublasHandle = create_handle();
+
 __global__ void gSubtractMean(float* out, float* weights,
                               size_t rows, size_t cols) {
   for(int bid = 0; bid < rows; bid += gridDim.x) {
@@ -212,10 +220,7 @@ Tensor Prod(cublasHandle_t handle, Tensor C, const Tensor A, const Tensor B,
 Tensor Prod(Tensor C, const Tensor A, const Tensor B,
             bool transA, bool transB, Float beta) {
-  cublasHandle_t cublasHandle;
-  cublasCreate(&cublasHandle);
   Tensor temp = Prod(cublasHandle, C, A, B, transA, transB, beta);
-  cublasDestroy(cublasHandle);
   return temp;
 }
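
Creating and destroying a cuBLAS handle around every Prod call is expensive, so hoisting it into a single static handle is the point of this change. The @TODO asks for per-thread handling; a minimal sketch of one way to do that with a thread_local RAII wrapper (an assumption about a possible follow-up, not code from this commit):

#include <cublas_v2.h>

// One cuBLAS handle per thread: created lazily on first use in each
// thread, destroyed automatically when that thread exits.
class CublasHandle {
  public:
    CublasHandle() { cublasCreate(&handle_); }
    ~CublasHandle() { cublasDestroy(handle_); }
    cublasHandle_t get() const { return handle_; }
  private:
    cublasHandle_t handle_;
};

cublasHandle_t threadCublasHandle() {
  static thread_local CublasHandle handle;
  return handle.get();
}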

View File

src/train_mnist.cu (deleted file; matches the train_mnist target removed in the CMake diff above)

@@ -1,34 +0,0 @@
#include "marian.h"
#include "mnist.h"
#include "optimizers.h"
int main(int argc, char** argv) {
const size_t IMAGE_SIZE = 784;
const size_t LABEL_SIZE = 10;
int numofdata;
std::vector<float> trainImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", numofdata, IMAGE_SIZE);
std::vector<float> trainLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", numofdata, LABEL_SIZE);
using namespace marian;
using namespace keywords;
ExpressionGraph g;
Expr x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
Expr y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
Expr w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE}), "w");
Expr b = named(g.param(shape={1, LABEL_SIZE}), "b");
auto scores = dot(x, w) + b;
auto lr = softmax(scores);
auto cost = named(-mean(sum(y * log(lr), axis=1), axis=0), "cost");
std::cerr << "lr=" << lr.Debug() << std::endl;
Adam opt;
opt(g, 300);
return 0;
}

View File

src/validate_mnist.cu (deleted file; matches the validate_mnist target removed in the CMake diff above)

@@ -1,90 +0,0 @@
#include "marian.h"
#include "mnist.h"
#include "npz_converter.h"
#include "optimizers.h"
using namespace marian;
using namespace keywords;
const size_t IMAGE_SIZE = 784;
const size_t LABEL_SIZE = 10;
int BATCH_SIZE = 10000;
ExpressionGraph build_graph() {
std::cerr << "Loading model params...";
NpzConverter converter("../scripts/test_model_single/model.npz");
std::vector<float> wData, bData;
Shape wShape, bShape;
converter.Load("weights", wData, wShape);
converter.Load("bias", bData, bShape);
std::cerr << "Done." << std::endl;
std::cerr << "Building model...";
ExpressionGraph g;
auto x = named(g.input(shape={whatevs, IMAGE_SIZE}), "x");
auto y = named(g.input(shape={whatevs, LABEL_SIZE}), "y");
auto w = named(g.param(shape={IMAGE_SIZE, LABEL_SIZE},
init=from_vector(wData)), "w");
auto b = named(g.param(shape={1, LABEL_SIZE},
init=from_vector(bData)), "b");
auto probs = named(
softmax(dot(x, w) + b),
"probs"
);
auto cost = named(
-mean(sum(y * log(probs), axis=1), axis=0),
"cost"
);
std::cerr << "Done." << std::endl;
return g;
}
int main(int argc, char** argv) {
std::cerr << "Loading test set...";
std::vector<float> testImages = datasets::mnist::ReadImages("../examples/mnist/t10k-images-idx3-ubyte", BATCH_SIZE, IMAGE_SIZE);
std::vector<float> testLabels = datasets::mnist::ReadLabels("../examples/mnist/t10k-labels-idx1-ubyte", BATCH_SIZE, LABEL_SIZE);
std::cerr << "Done." << std::endl;
ExpressionGraph g = build_graph();
Tensor xt({BATCH_SIZE, IMAGE_SIZE});
Tensor yt({BATCH_SIZE, LABEL_SIZE});
g["x"] = (xt << testImages);
g["y"] = (yt << testLabels);
Adagrad opt;
for(size_t j = 0; j < 20; ++j) {
for(size_t i = 0; i < 60; ++i) {
opt(g, BATCH_SIZE);
}
std::cerr << g["cost"].val()[0] << std::endl;
}
std::vector<float> results;
results << g["probs"].val();
size_t acc = 0;
for (size_t i = 0; i < testLabels.size(); i += LABEL_SIZE) {
size_t correct = 0;
size_t proposed = 0;
for (size_t j = 0; j < LABEL_SIZE; ++j) {
if (testLabels[i+j])
correct = j;
if (results[i + j] > results[i + proposed])
proposed = j;
}
acc += (correct == proposed);
}
std::cerr << "Cost: " << g["cost"].val()[0] << " - Accuracy: " << float(acc) / BATCH_SIZE << std::endl;
return 0;
}