From b090b5f073174f52dd16f9b7beccd5b4a43e06d6 Mon Sep 17 00:00:00 2001 From: Huw Campbell Date: Thu, 8 Dec 2016 01:16:20 +1100 Subject: [PATCH] Make things faster --- grenade.cabal | 1 + src/Grenade/Core/Network.hs | 2 +- src/Grenade/Core/Runner.hs | 4 +- src/Grenade/Layers/Convolution.hs | 103 ++------- src/Grenade/Layers/Convolution/Internal.hs | 238 +++++++++++++++++++++ src/Grenade/Layers/Crop.hs | 2 +- src/Grenade/Layers/Dropout.hs | 4 +- src/Grenade/Layers/Flatten.hs | 4 +- src/Grenade/Layers/FullyConnected.hs | 2 +- src/Grenade/Layers/Fuse.hs | 6 +- src/Grenade/Layers/Logit.hs | 6 +- src/Grenade/Layers/Pad.hs | 2 +- src/Grenade/Layers/Pooling.hs | 6 +- src/Grenade/Layers/Relu.hs | 6 +- src/Grenade/Layers/Tanh.hs | 6 +- test/Test/Grenade/Layers/Convolution.hs | 29 ++- 16 files changed, 314 insertions(+), 107 deletions(-) create mode 100644 src/Grenade/Layers/Convolution/Internal.hs diff --git a/grenade.cabal b/grenade.cabal index 9acf065..8294132 100644 --- a/grenade.cabal +++ b/grenade.cabal @@ -40,6 +40,7 @@ library Grenade.Core.Shape Grenade.Layers.Crop Grenade.Layers.Convolution + Grenade.Layers.Convolution.Internal Grenade.Layers.Dropout Grenade.Layers.FullyConnected Grenade.Layers.Flatten diff --git a/src/Grenade/Core/Network.hs b/src/Grenade/Core/Network.hs index 1f5ea00..fd6d68a 100644 --- a/src/Grenade/Core/Network.hs +++ b/src/Grenade/Core/Network.hs @@ -51,7 +51,7 @@ class UpdateLayer x => Layer x (i :: Shape) (o :: Shape) where -- layer gave from the input and the back propagated derivatives from -- the layer above. -- Returns the gradient layer and the derivatives to push back further. - runBackards :: x -> S' i -> S' o -> (Gradient x, S' i) + runBackwards :: x -> S' i -> S' o -> (Gradient x, S' i) -- | Type of a network. -- The [*] type specifies the types of the layers. This is needed for parallel diff --git a/src/Grenade/Core/Runner.hs b/src/Grenade/Core/Runner.hs index d3f1dd6..1cfd5cb 100644 --- a/src/Grenade/Core/Runner.hs +++ b/src/Grenade/Core/Runner.hs @@ -32,7 +32,7 @@ backPropagate network input target = -- recursively run the rest of the network, and get the gradients from above. (n', dWs') = go y n -- calculate the gradient for this layer to pass down, - (layer', dWs) = runBackards layer x dWs' + (layer', dWs) = runBackwards layer x dWs' in (layer' :/> n', dWs) @@ -40,7 +40,7 @@ backPropagate network input target = go !x (O layer) = let y = runForwards layer x -- the gradient (how much y affects the error) - (layer', dWs) = runBackards layer x (y - target) + (layer', dWs) = runBackwards layer x (y - target) in (OG layer', dWs) diff --git a/src/Grenade/Layers/Convolution.hs b/src/Grenade/Layers/Convolution.hs index ae3cd26..c7e640f 100644 --- a/src/Grenade/Layers/Convolution.hs +++ b/src/Grenade/Layers/Convolution.hs @@ -16,26 +16,22 @@ module Grenade.Layers.Convolution ( Convolution (..) , Convolution' (..) 
, randomConvolution - , im2col - , vid2col - , col2im - , col2vid - , fittingStarts ) where -import Control.Monad.Random hiding (fromList) +import Control.Monad.Random hiding ( fromList ) import Data.Maybe import Data.Proxy import Data.Singletons.TypeLits import GHC.TypeLits -import Numeric.LinearAlgebra hiding (uniformSample, konst) +import Numeric.LinearAlgebra hiding ( uniformSample, konst ) import qualified Numeric.LinearAlgebra as LA import Numeric.LinearAlgebra.Static hiding ((|||), build, toRows) import Grenade.Core.Network import Grenade.Core.Shape import Grenade.Core.Vector +import Grenade.Layers.Convolution.Internal -- | A convolution layer for a neural network. -- This uses the im2col convolution trick popularised by Caffe, which essentially turns the @@ -153,18 +149,21 @@ instance ( KnownNat kernelRows runForwards (Convolution kernel _) (S2D' input) = let ex = extract input ek = extract kernel + ix = fromIntegral $ natVal (Proxy :: Proxy inputRows) + iy = fromIntegral $ natVal (Proxy :: Proxy inputCols) kx = fromIntegral $ natVal (Proxy :: Proxy kernelRows) ky = fromIntegral $ natVal (Proxy :: Proxy kernelCols) sx = fromIntegral $ natVal (Proxy :: Proxy strideRows) sy = fromIntegral $ natVal (Proxy :: Proxy strideCols) ox = fromIntegral $ natVal (Proxy :: Proxy outputRows) oy = fromIntegral $ natVal (Proxy :: Proxy outputCols) - c = im2col kx ky sx sy ex + c = im2colUnsafe kx ky sx sy ix iy ex mt = c LA.<> ek - r = col2vid 1 1 1 1 ox oy mt + r = col2vidUnsafe 1 1 1 1 ox oy mt rs = fmap (fromJust . create) r in S3D' $ mkVector rs - runBackards (Convolution kernel _) (S2D' input) (S3D' dEdy) = + + runBackwards (Convolution kernel _) (S2D' input) (S3D' dEdy) = let ex = extract input ix = fromIntegral $ natVal (Proxy :: Proxy inputRows) iy = fromIntegral $ natVal (Proxy :: Proxy inputCols) @@ -174,17 +173,19 @@ instance ( KnownNat kernelRows sy = fromIntegral $ natVal (Proxy :: Proxy strideCols) ox = fromIntegral $ natVal (Proxy :: Proxy outputRows) oy = fromIntegral $ natVal (Proxy :: Proxy outputCols) - c = im2col kx ky sx sy ex + fl = fromIntegral $ natVal (Proxy :: Proxy filters) + + c = im2colUnsafe kx ky sx sy ix iy ex eo = vecToList $ fmap extract dEdy ek = extract kernel - vs = vid2col 1 1 1 1 ox oy eo + vs = vid2colUnsafe fl 1 1 1 1 ox oy eo kN = fromJust . create $ tr c LA.<> vs dW = vs LA.<> tr ek - xW = col2im kx ky sx sy ix iy dW + xW = col2imUnsafe kx ky sx sy ix iy dW in (Convolution' kN, S2D' . fromJust . create $ xW) @@ -215,12 +216,13 @@ instance ( KnownNat kernelRows sy = fromIntegral $ natVal (Proxy :: Proxy strideCols) ox = fromIntegral $ natVal (Proxy :: Proxy outputRows) oy = fromIntegral $ natVal (Proxy :: Proxy outputCols) - c = vid2col kx ky sx sy ix iy ex + ch = fromIntegral $ natVal (Proxy :: Proxy channels) + c = vid2colUnsafe ch kx ky sx sy ix iy ex mt = c LA.<> ek - r = col2vid 1 1 1 1 ox oy mt + r = col2vidUnsafe 1 1 1 1 ox oy mt rs = fmap (fromJust . 
create) r in S3D' $ mkVector rs - runBackards (Convolution kernel _) (S3D' input) (S3D' dEdy) = + runBackwards (Convolution kernel _) (S3D' input) (S3D' dEdy) = let ex = vecToList $ fmap extract input ix = fromIntegral $ natVal (Proxy :: Proxy inputRows) iy = fromIntegral $ natVal (Proxy :: Proxy inputCols) @@ -230,77 +232,18 @@ instance ( KnownNat kernelRows sy = fromIntegral $ natVal (Proxy :: Proxy strideCols) ox = fromIntegral $ natVal (Proxy :: Proxy outputRows) oy = fromIntegral $ natVal (Proxy :: Proxy outputCols) - c = vid2col kx ky sx sy ix iy ex + ch = fromIntegral $ natVal (Proxy :: Proxy channels) + fl = fromIntegral $ natVal (Proxy :: Proxy filters) + c = vid2colUnsafe ch kx ky sx sy ix iy ex eo = vecToList $ fmap extract dEdy ek = extract kernel - vs = vid2col 1 1 1 1 ox oy eo + vs = vid2colUnsafe fl 1 1 1 1 ox oy eo kN = fromJust . create $ tr c LA.<> vs dW = vs LA.<> tr ek - xW = col2vid kx ky sx sy ix iy dW + xW = col2vidUnsafe kx ky sx sy ix iy dW in (Convolution' kN, S3D' . mkVector . fmap (fromJust . create) $ xW) - -im2col :: Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double -im2col nrows ncols srows scols m = - let starts = fittingStarts (rows m) nrows srows (cols m) ncols scols - in im2colFit starts nrows ncols m - -im2colFit :: [(Int,Int)] -> Int -> Int -> Matrix Double -> Matrix Double -im2colFit starts nrows ncols m = - let imRows = fmap (\start -> flatten $ subMatrix start (nrows, ncols) m) starts - in fromRows imRows - -vid2col :: Int -> Int -> Int -> Int -> Int -> Int -> [Matrix Double] -> Matrix Double -vid2col nrows ncols srows scols inputrows inputcols ms = - let starts = fittingStarts inputrows nrows srows inputcols ncols scols - subs = fmap (im2colFit starts nrows ncols) ms - in foldl1 (|||) subs - -col2vid :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> [Matrix Double] -col2vid nrows ncols srows scols drows dcols m = - let starts = fittingStart (cols m) (nrows * ncols) (nrows * ncols) - r = rows m - mats = fmap (\s -> subMatrix (0,s) (r, nrows * ncols) m) starts - colSts = fittingStarts drows nrows srows dcols ncols scols - in fmap (col2imfit colSts nrows ncols drows dcols) mats - -col2im :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double -col2im krows kcols srows scols drows dcols m = - let starts = fittingStarts drows krows srows dcols kcols scols - in col2imfit starts krows kcols drows dcols m - -col2imfit :: [(Int,Int)] -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double -col2imfit starts krows kcols drows dcols m = - let indicies = fmap (\[a,b] -> (a,b)) $ sequence [[0..(krows-1)], [0..(kcols-1)]] - convs = fmap (zip indicies . toList) . toRows $ m - pairs = zip convs starts - accums = concat $ fmap (\(conv',(stx',sty')) -> fmap (\((ix,iy), val) -> ((ix + stx', iy + sty'), val)) conv') pairs - in accum (LA.konst 0 (drows, dcols)) (+) accums - - --- | These functions are not even remotely safe, but it's only called from the statically typed --- commands, so we should be good ?!?!? --- Returns the starting sub matrix locations which fit inside the larger matrix for the --- convolution. Takes into account the stride and kernel size. 
-fittingStarts :: Int -> Int -> Int -> Int -> Int -> Int -> [(Int,Int)] -fittingStarts nrows kernelrows steprows ncols kernelcols stepcolsh = - let rs = fittingStart nrows kernelrows steprows - cs = fittingStart ncols kernelcols stepcolsh - ls = sequence [rs, cs] - in fmap (\[a,b] -> (a,b)) ls - --- | Returns the starting sub vector which fit inside the larger vector for the --- convolution. Takes into account the stride and kernel size. -fittingStart :: Int -> Int -> Int -> [Int] -fittingStart width kernel steps = - let go left | left + kernel < width - = left : go (left + steps) - | left + kernel == width - = left : [] - | otherwise - = error "Kernel and step do not fit in matrix." - in go 0 diff --git a/src/Grenade/Layers/Convolution/Internal.hs b/src/Grenade/Layers/Convolution/Internal.hs new file mode 100644 index 0000000..f9a6e17 --- /dev/null +++ b/src/Grenade/Layers/Convolution/Internal.hs @@ -0,0 +1,238 @@ +module Grenade.Layers.Convolution.Internal ( + im2col +-- , im2colUnsafe + , vid2col + , col2im + , col2imFit + , col2vid + + , col2vidUnsafe + , col2imUnsafe + , im2colUnsafe + , vid2colUnsafe + , fittingStarts + ) where + +import Control.Monad.ST +import Control.Parallel.Strategies ( parMap, rseq ) + +import Data.STRef +import Data.Foldable ( forM_ ) + +import Numeric.LinearAlgebra hiding ( uniformSample, konst ) +import qualified Numeric.LinearAlgebra as LA +import qualified Numeric.LinearAlgebra.Devel as U + +im2col :: Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double +im2col nrows ncols srows scols m = + let starts = fittingStarts (rows m) nrows srows (cols m) ncols scols + in im2colFit starts nrows ncols m + +im2colFit :: [(Int,Int)] -> Int -> Int -> Matrix Double -> Matrix Double +im2colFit starts nrows ncols m = + let imRows = fmap (\start -> flatten $ subMatrix start (nrows, ncols) m) starts + in fromRows imRows + +vid2col :: Int -> Int -> Int -> Int -> Int -> Int -> [Matrix Double] -> Matrix Double +vid2col nrows ncols srows scols inputrows inputcols ms = + let starts = fittingStarts inputrows nrows srows inputcols ncols scols + subs = parMap rseq (im2colFit starts nrows ncols) ms + in foldl1 (|||) subs + +col2vid :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> [Matrix Double] +col2vid krows kcols srows scols drows dcols m = + let starts = fittingStart (cols m) (krows * kcols) (krows * kcols) + r = rows m + mats = fmap (\s -> subMatrix (0,s) (r, krows * kcols) m) starts + colSts = fittingStarts drows krows srows dcols kcols scols + in parMap rseq (col2imFit colSts krows kcols drows dcols) mats + +col2im :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double +col2im krows kcols srows scols drows dcols m = + let starts = fittingStarts drows krows srows dcols kcols scols + in col2imFit starts krows kcols drows dcols m + +-- | These functions are not even remotely safe, but it's only called from the statically typed +-- commands, so we should be good ?!?!? +-- Returns the starting sub matrix locations which fit inside the larger matrix for the +-- convolution. Takes into account the stride and kernel size. +fittingStarts :: Int -> Int -> Int -> Int -> Int -> Int -> [(Int,Int)] +fittingStarts nrows kernelrows steprows ncols kernelcols stepcolsh = + let rs = fittingStart nrows kernelrows steprows + cs = fittingStart ncols kernelcols stepcolsh + ls = sequence [rs, cs] + in fmap (\[a,b] -> (a,b)) ls + +-- | Returns the starting sub vector which fit inside the larger vector for the +-- convolution. 
Takes into account the stride and kernel size. +fittingStart :: Int -> Int -> Int -> [Int] +fittingStart width kernel steps = + let go left | left + kernel < width + = left : go (left + steps) + | left + kernel == width + = [left] + | otherwise + = error "Kernel and step do not fit in matrix." + in go 0 + +col2imFit :: [(Int,Int)] -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double +col2imFit starts krows kcols drows dcols m = + let indicies = (\[a,b] -> (a,b)) <$> sequence [[0..(krows-1)], [0..(kcols-1)]] + convs = fmap (zip indicies . toList) . toRows $ m + pairs = zip convs starts + accums = concatMap (\(conv',(stx',sty')) -> fmap (\((ix,iy), val) -> ((ix + stx', iy + sty'), val)) conv') pairs + in accum (LA.konst 0 (drows, dcols)) (+) accums + +-- void col2im_cpu(const Dtype* data_col, const int channels, +-- const int height, const int width, const int kernel_h, const int kernel_w, +-- const int pad_h, const int pad_w, +-- const int stride_h, const int stride_w, +-- const int dilation_h, const int dilation_w, +-- Dtype* data_im) { +-- caffe_set(height * width * channels, Dtype(0), data_im); +-- const int output_h = (height + 2 * pad_h - +-- (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; +-- const int output_w = (width + 2 * pad_w - +-- (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; +-- const int channel_size = height * width; +-- for (int channel = channels; channel--; data_im += channel_size) { +-- for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { +-- for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { +-- int input_row = -pad_h + kernel_row * dilation_h; +-- for (int output_rows = output_h; output_rows; output_rows--) { +-- if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { +-- data_col += output_w; +-- } else { +-- int input_col = -pad_w + kernel_col * dilation_w; +-- for (int output_col = output_w; output_col; output_col--) { +-- if (is_a_ge_zero_and_a_lt_b(input_col, width)) { +-- data_im[input_row * width + input_col] += *data_col; +-- } +-- data_col++; +-- input_col += stride_w; +-- } +-- } +-- input_row += stride_h; +-- } +-- } +-- } +-- } +-- } + + +-- let starts = fittingStart (cols m) (krows * kcols) (krows * kcols) +-- r = rows m +-- mats = fmap (\s -> subMatrix (0,s) (r, krows * kcols) m) starts +-- in parMap rseq (col2imUnsafe krows kcols srows scols drows dcols) mats + +col2imUnsafe :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double +col2imUnsafe kernelRows kernelColumns strideRows strideColumns destinationRows destinationCols columnMatrix = U.runSTMatrix $ do + let columnMatrixRows = rows columnMatrix + + dataIm <- U.newMatrix 0 destinationRows destinationCols + + offsetR <- newSTRef 0 + offsetC <- newSTRef 0 + + forM_ [0 .. columnMatrixRows - 1] $ \ir -> do + inputColumn <- newSTRef 0 + forM_ [0 .. kernelRows -1] $ \kr -> + forM_ [0 .. 
kernelColumns -1] $ \kc -> do + ic <- readSTRef inputColumn + offsetR' <- readSTRef offsetR + offsetC' <- readSTRef offsetC + U.modifyMatrix dataIm (kr + offsetR') (kc + offsetC') (+ atIndex columnMatrix (ir,ic)) + modifySTRef inputColumn (+1) + + offsetC' <- readSTRef offsetC + if offsetC' + kernelColumns < destinationCols + then modifySTRef offsetC (+ strideColumns) + else writeSTRef offsetC 0 >> modifySTRef offsetR (+ strideRows) + + return dataIm + +col2vidUnsafe :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> [Matrix Double] +col2vidUnsafe kernelRows kernelColumns strideRows strideColumns destinationRows destinationCols columnMatrix = runST $ do + let columnMatrixRows = rows columnMatrix + let filters = cols columnMatrix `div` (kernelRows * kernelColumns) + + dataIms <- traverse (\_ -> U.newMatrix 0 destinationRows destinationCols) [0 .. filters-1] + + offsetR <- newSTRef 0 + offsetC <- newSTRef 0 + offsetM <- newSTRef 0 + + forM_ dataIms $ \dataIm -> do + offsetM' <- readSTRef offsetM + forM_ [0 .. columnMatrixRows - 1] $ \ir -> do + inputColumn <- newSTRef 0 + forM_ [0 .. kernelRows -1] $ \kr -> + forM_ [0 .. kernelColumns -1] $ \kc -> do + ic <- readSTRef inputColumn + offsetR' <- readSTRef offsetR + offsetC' <- readSTRef offsetC + U.modifyMatrix dataIm (kr + offsetR') (kc + offsetC') (+ atIndex columnMatrix (ir, ic + offsetM')) + modifySTRef inputColumn (+1) + + offsetC' <- readSTRef offsetC + if offsetC' + kernelColumns < destinationCols + then modifySTRef offsetC (+ strideColumns) + else writeSTRef offsetC 0 >> modifySTRef offsetR (+ strideRows) + + writeSTRef offsetR 0 + writeSTRef offsetC 0 + modifySTRef offsetM (+ (kernelRows * kernelColumns)) + + traverse U.freezeMatrix dataIms + +vid2colUnsafe :: Int -> Int -> Int -> Int -> Int -> Int -> Int -> [Matrix Double] -> Matrix Double +vid2colUnsafe channels kernelRows kernelColumns striderows stridecols vidrows vidcols dataVid = U.runSTMatrix $ do + let starts = fittingStarts vidrows kernelRows striderows vidcols kernelColumns stridecols + matWidth = kernelRows * kernelColumns + destinationRows = 1 + (vidrows - kernelRows) `div` striderows + destinationCols = 1 + (vidcols - kernelColumns) `div` stridecols + destinationSize = destinationRows * destinationCols + + dataCol <- U.newMatrix 0 destinationSize (channels * matWidth) + + offsetC <- newSTRef 0 + + forM_ dataVid $ \dataIm -> do + inputRow <- newSTRef 0 + offsetC' <- readSTRef offsetC + forM_ starts $ \(startRow, startCol) -> do + inputColumn <- newSTRef 0 + inputRow' <- readSTRef inputRow + forM_ [0 .. kernelRows -1] $ \kr -> + forM_ [0 .. 
kernelColumns -1] $ \kc -> do + inputColumn' <- readSTRef inputColumn + U.modifyMatrix dataCol inputRow' (inputColumn' + offsetC') (+ atIndex dataIm (kr + startRow, kc + startCol)) + modifySTRef inputColumn (+1) + modifySTRef inputRow (+1) + + modifySTRef offsetC (+ matWidth) + + return dataCol + +im2colUnsafe :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double +im2colUnsafe kernelRows kernelColumns striderows stridecols vidrows vidcols dataIm = U.runSTMatrix $ do + let starts = fittingStarts vidrows kernelRows striderows vidcols kernelColumns stridecols + matWidth = kernelRows * kernelColumns + destinationRows = 1 + (vidrows - kernelRows) `div` striderows + destinationCols = 1 + (vidcols - kernelColumns) `div` stridecols + destinationSize = destinationRows * destinationCols + + dataCol <- U.newMatrix 0 destinationSize matWidth + + inputRow <- newSTRef 0 + forM_ starts $ \(startRow, startCol) -> do + inputColumn <- newSTRef 0 + inputRow' <- readSTRef inputRow + forM_ [0 .. kernelRows -1] $ \kr -> + forM_ [0 .. kernelColumns -1] $ \kc -> do + inputColumn' <- readSTRef inputColumn + U.modifyMatrix dataCol inputRow' inputColumn' (+ atIndex dataIm (kr + startRow, kc + startCol)) + modifySTRef inputColumn (+1) + modifySTRef inputRow (+1) + + return dataCol diff --git a/src/Grenade/Layers/Crop.hs b/src/Grenade/Layers/Crop.hs index d7712db..2e5ae9c 100644 --- a/src/Grenade/Layers/Crop.hs +++ b/src/Grenade/Layers/Crop.hs @@ -58,7 +58,7 @@ instance ( KnownNat cropLeft m = extract input r = subMatrix (cropt, cropl) (nrows, ncols) m in S2D' . fromJust . create $ r - runBackards _ _ (S2D' dEdy) = + runBackwards _ _ (S2D' dEdy) = let cropl = fromIntegral $ natVal (Proxy :: Proxy cropLeft) cropt = fromIntegral $ natVal (Proxy :: Proxy cropTop) cropr = fromIntegral $ natVal (Proxy :: Proxy cropRight) diff --git a/src/Grenade/Layers/Dropout.hs b/src/Grenade/Layers/Dropout.hs index 90b73e7..211f254 100644 --- a/src/Grenade/Layers/Dropout.hs +++ b/src/Grenade/Layers/Dropout.hs @@ -47,5 +47,5 @@ randomDropout rate = do instance (KnownNat i) => Layer (Dropout i) ('D1 i) ('D1 i) where runForwards (Dropout drops) (S1D' x) = S1D' $ x * drops runForwards (Pass rate) (S1D' x)= S1D' $ dvmap (* (1 - rate)) x - runBackards (Dropout drops) _ (S1D' x) = ((), S1D' $ x * drops) - runBackards (Pass rate) _ (S1D' x) = ((), S1D' $ dvmap (* (1 - rate)) x) + runBackwards (Dropout drops) _ (S1D' x) = ((), S1D' $ x * drops) + runBackwards (Pass rate) _ (S1D' x) = ((), S1D' $ dvmap (* (1 - rate)) x) diff --git a/src/Grenade/Layers/Flatten.hs b/src/Grenade/Layers/Flatten.hs index be61f84..b63d387 100644 --- a/src/Grenade/Layers/Flatten.hs +++ b/src/Grenade/Layers/Flatten.hs @@ -33,11 +33,11 @@ instance UpdateLayer FlattenLayer where instance (KnownNat a, KnownNat x, KnownNat y, a ~ (x * y)) => Layer FlattenLayer ('D2 x y) ('D1 a) where runForwards _ (S2D' y) = S1D' . fromList . toList . flatten . extract $ y - runBackards _ _ (S1D' y) = ((), S2D' . fromList . toList . unwrap $ y) + runBackwards _ _ (S1D' y) = ((), S2D' . fromList . toList . unwrap $ y) instance (KnownNat a, KnownNat x, KnownNat y, KnownNat z, a ~ (x * y * z)) => Layer FlattenLayer ('D3 x y z) ('D1 a) where runForwards _ (S3D' y) = S1D' . raiseShapeError . create . vjoin . vecToList . fmap (flatten . 
extract) $ y - runBackards _ _ (S1D' o) = + runBackwards _ _ (S1D' o) = let x' = fromIntegral $ natVal (Proxy :: Proxy x) y' = fromIntegral $ natVal (Proxy :: Proxy y) z' = fromIntegral $ natVal (Proxy :: Proxy z) diff --git a/src/Grenade/Layers/FullyConnected.hs b/src/Grenade/Layers/FullyConnected.hs index 131044a..13a8f05 100644 --- a/src/Grenade/Layers/FullyConnected.hs +++ b/src/Grenade/Layers/FullyConnected.hs @@ -52,7 +52,7 @@ instance (KnownNat i, KnownNat o) => Layer (FullyConnected i o) ('D1 i) ('D1 o) runForwards (FullyConnected wB _ wN _) (S1D' v) = S1D' (wB + wN #> v) -- Run a backpropogation step for a full connected layer. - runBackards (FullyConnected _ _ wN _) (S1D' x) (S1D' dEdy) = + runBackwards (FullyConnected _ _ wN _) (S1D' x) (S1D' dEdy) = let wB' = dEdy mm' = dEdy `outer` x -- calcluate derivatives for next step diff --git a/src/Grenade/Layers/Fuse.hs b/src/Grenade/Layers/Fuse.hs index 7f12252..9ff7559 100644 --- a/src/Grenade/Layers/Fuse.hs +++ b/src/Grenade/Layers/Fuse.hs @@ -45,8 +45,8 @@ instance (Layer x i h, Layer y h o) => Layer (Fuse x y i h o) i o where let yInput :: S' h = runForwards x input in runForwards y yInput - runBackards (x :$$ y) input backGradient = + runBackwards (x :$$ y) input backGradient = let yInput :: S' h = runForwards x input - (y', yGrad) = runBackards y yInput backGradient - (x', xGrad) = runBackards x input yGrad + (y', yGrad) = runBackwards y yInput backGradient + (x', xGrad) = runBackwards x input yGrad in ((x', y'), xGrad) diff --git a/src/Grenade/Layers/Logit.hs b/src/Grenade/Layers/Logit.hs index 05d3c4c..6c6c399 100644 --- a/src/Grenade/Layers/Logit.hs +++ b/src/Grenade/Layers/Logit.hs @@ -29,15 +29,15 @@ instance UpdateLayer Logit where instance (KnownNat i) => Layer Logit ('D1 i) ('D1 i) where runForwards _ (S1D' y) = S1D' (logistic y) - runBackards _ (S1D' y) (S1D' dEdy) = ((), S1D' (logistic' y * dEdy)) + runBackwards _ (S1D' y) (S1D' dEdy) = ((), S1D' (logistic' y * dEdy)) instance (KnownNat i, KnownNat j) => Layer Logit ('D2 i j) ('D2 i j) where runForwards _ (S2D' y) = S2D' (logistic y) - runBackards _ (S2D' y) (S2D' dEdy) = ((), S2D' (logistic' y * dEdy)) + runBackwards _ (S2D' y) (S2D' dEdy) = ((), S2D' (logistic' y * dEdy)) instance (KnownNat i, KnownNat j, KnownNat k) => Layer Logit ('D3 i j k) ('D3 i j k) where runForwards _ (S3D' y) = S3D' (fmap logistic y) - runBackards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> logistic' y' * dEdy') y dEdy)) + runBackwards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> logistic' y' * dEdy') y dEdy)) logistic :: Floating a => a -> a diff --git a/src/Grenade/Layers/Pad.hs b/src/Grenade/Layers/Pad.hs index 7540d1c..0969e39 100644 --- a/src/Grenade/Layers/Pad.hs +++ b/src/Grenade/Layers/Pad.hs @@ -58,7 +58,7 @@ instance ( KnownNat padLeft m = extract input r = diagBlock [konst 0 (padt,padl), m, konst 0 (padb,padr)] in S2D' . fromJust . 
create $ r - runBackards Pad _ (S2D' dEdy) = + runBackwards Pad _ (S2D' dEdy) = let padl = fromIntegral $ natVal (Proxy :: Proxy padLeft) padt = fromIntegral $ natVal (Proxy :: Proxy padTop) nrows = fromIntegral $ natVal (Proxy :: Proxy inputRows) diff --git a/src/Grenade/Layers/Pooling.hs b/src/Grenade/Layers/Pooling.hs index c83feba..e133b3f 100644 --- a/src/Grenade/Layers/Pooling.hs +++ b/src/Grenade/Layers/Pooling.hs @@ -24,7 +24,7 @@ import GHC.TypeLits import Grenade.Core.Network import Grenade.Core.Shape import Grenade.Core.Vector -import Grenade.Layers.Convolution +import Grenade.Layers.Convolution.Internal import Numeric.LinearAlgebra hiding (uniformSample) import qualified Numeric.LinearAlgebra as LA @@ -75,7 +75,7 @@ instance ( KnownNat kernelRows r = poolForward kx ky sx sy ox oy $ ex rs = fromJust . create $ r in S2D' $ rs - runBackards Pooling (S2D' input) (S2D' dEdy) = + runBackwards Pooling (S2D' input) (S2D' dEdy) = let kx = fromIntegral $ natVal (Proxy :: Proxy kernelRows) ky = fromIntegral $ natVal (Proxy :: Proxy kernelColumns) sx = fromIntegral $ natVal (Proxy :: Proxy strideRows) @@ -111,7 +111,7 @@ instance ( KnownNat kernelRows r = poolForwardList kx ky sx sy ix iy ox oy ex rs = fmap (fromJust . create) r in S3D' rs - runBackards Pooling (S3D' input) (S3D' dEdy) = + runBackwards Pooling (S3D' input) (S3D' dEdy) = let ix = fromIntegral $ natVal (Proxy :: Proxy inputRows) iy = fromIntegral $ natVal (Proxy :: Proxy inputColumns) kx = fromIntegral $ natVal (Proxy :: Proxy kernelRows) diff --git a/src/Grenade/Layers/Relu.hs b/src/Grenade/Layers/Relu.hs index 93b546d..e3e368e 100644 --- a/src/Grenade/Layers/Relu.hs +++ b/src/Grenade/Layers/Relu.hs @@ -31,7 +31,7 @@ instance ( KnownNat i) => Layer Relu ('D1 i) ('D1 i) where runForwards _ (S1D' y) = S1D' (relu y) where relu = LAS.dvmap (\a -> if a <= 0 then 0 else a) - runBackards _ (S1D' y) (S1D' dEdy) = ((), S1D' (relu' y * dEdy)) + runBackwards _ (S1D' y) (S1D' dEdy) = ((), S1D' (relu' y * dEdy)) where relu' = LAS.dvmap (\a -> if a <= 0 then 0 else 1) @@ -39,7 +39,7 @@ instance (KnownNat i, KnownNat j) => Layer Relu ('D2 i j) ('D2 i j) where runForwards _ (S2D' y) = S2D' (relu y) where relu = LAS.dmmap (\a -> if a <= 0 then 0 else a) - runBackards _ (S2D' y) (S2D' dEdy) = ((), S2D' (relu' y * dEdy)) + runBackwards _ (S2D' y) (S2D' dEdy) = ((), S2D' (relu' y * dEdy)) where relu' = LAS.dmmap (\a -> if a <= 0 then 0 else 1) @@ -47,6 +47,6 @@ instance (KnownNat i, KnownNat j, KnownNat k) => Layer Relu ('D3 i j k) ('D3 i j runForwards _ (S3D' y) = S3D' (fmap relu y) where relu = LAS.dmmap (\a -> if a <= 0 then 0 else a) - runBackards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> relu' y' * dEdy') y dEdy)) + runBackwards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> relu' y' * dEdy') y dEdy)) where relu' = LAS.dmmap (\a -> if a <= 0 then 0 else 1) diff --git a/src/Grenade/Layers/Tanh.hs b/src/Grenade/Layers/Tanh.hs index 8126e48..610fd6c 100644 --- a/src/Grenade/Layers/Tanh.hs +++ b/src/Grenade/Layers/Tanh.hs @@ -26,15 +26,15 @@ instance UpdateLayer Tanh where instance KnownNat i => Layer Tanh ('D1 i) ('D1 i) where runForwards _ (S1D' y) = S1D' (tanh y) - runBackards _ (S1D' y) (S1D' dEdy) = ((), S1D' (tanh' y * dEdy)) + runBackwards _ (S1D' y) (S1D' dEdy) = ((), S1D' (tanh' y * dEdy)) instance (KnownNat i, KnownNat j) => Layer Tanh ('D2 i j) ('D2 i j) where runForwards _ (S2D' y) = S2D' (tanh y) - runBackards _ (S2D' y) (S2D' dEdy) = ((), S2D' (tanh' y * dEdy)) + runBackwards _ (S2D' y) (S2D' dEdy) 
= ((), S2D' (tanh' y * dEdy)) instance (KnownNat i, KnownNat j, KnownNat k) => Layer Tanh ('D3 i j k) ('D3 i j k) where runForwards _ (S3D' y) = S3D' (fmap tanh y) - runBackards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> tanh' y' * dEdy') y dEdy)) + runBackwards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> tanh' y' * dEdy') y dEdy)) tanh' :: (Floating a) => a -> a tanh' t = 1 - s ^ (2 :: Int) where s = tanh t diff --git a/test/Test/Grenade/Layers/Convolution.hs b/test/Test/Grenade/Layers/Convolution.hs index cff38a9..65ecbc6 100644 --- a/test/Test/Grenade/Layers/Convolution.hs +++ b/test/Test/Grenade/Layers/Convolution.hs @@ -8,6 +8,7 @@ import Grenade.Core.Shape import Grenade.Core.Vector as Grenade import Grenade.Core.Network import Grenade.Layers.Convolution +import Grenade.Layers.Convolution.Internal import Numeric.LinearAlgebra hiding (uniformSample, konst, (===)) import qualified Numeric.LinearAlgebra.Static as HStatic @@ -63,6 +64,17 @@ prop_im2col_sym_on_same_stride = once $ out = col2im 3 2 3 2 3 4 . im2col 3 2 3 2 $ input in input === out +-- If there's no overlap (stride is the same size as the kernel) +-- then col2im . im2col should be symmetric. +prop_im2colunsafe_sym_on_same_stride = once $ + let input = (3><4) + [ 1.0, 2.0, 3.0, 4.0 + , 5.0, 6.0, 7.0, 8.0 + , 9.0, 10.0, 11.0, 12.0 ] + out = col2imUnsafe 3 2 3 2 3 4 . im2colUnsafe 3 2 3 2 3 4 $ input + in input === out + + -- If there is an overlap, then the gradient passed back should be -- the sum of the gradients across the filters. prop_im2col_col2im_additive = once $ @@ -127,7 +139,7 @@ prop_simple_conv_forwards = once $ expectBack = (HStatic.matrix [ 1.0, 0.0, 0.0 , 0.0, -2.0,-1.0] :: HStatic.L 2 3) - (nc, inX) = runBackards convLayer input grad + (nc, inX) = runBackwards convLayer input grad in case (out, inX, nc) of (S3D' out' , S2D' inX', Convolution' backGrad) @@ -187,6 +199,19 @@ prop_vid2col_invert = once $ out = col2vid 3 2 3 2 3 4 . vid2col 3 2 3 2 3 4 $ input in input === out +prop_vid2col_invert_unsafe = once $ + let input = [(3><4) + [ 1.0, 2.0, 3.0, 4.0 + , 5.0, 6.0, 7.0, 8.0 + , 9.0, 10.0, 11.0, 12.0 ] + , (3><4) + [ 21.0, 22.0, 23.0, 24.0 + , 25.0, 26.0, 27.0, 28.0 + , 29.0, 30.0, 31.0, 32.0 ] ] + out = col2vidUnsafe 3 2 3 2 3 4 . vid2colUnsafe 2 3 2 3 2 3 4 $ input + in input === out + + -- This test show that 2D convs act the same -- 3D convs with one layer prop_single_conv_forwards = once $ @@ -239,7 +264,7 @@ prop_single_conv_forwards = once $ expectBack = (HStatic.matrix [ 1.0, 0.0, 0.0 , 0.0, -2.0,-1.0] :: HStatic.L 2 3) - (nc, inX) = runBackards convLayer input grad + (nc, inX) = runBackwards convLayer input grad in case (out, inX, nc) of (S3D' out' , S3D' inX', Convolution' backGrad)
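
A worked illustration of the new lowering primitives (a sketch only, not part of the patch; argument order follows the definitions added in Grenade.Layers.Convolution.Internal): with a 3x4 input, a 3x2 kernel and a 3x2 stride the windows do not overlap, so im2colUnsafe lays each kernel-sized window out as one row and col2imUnsafe folds those rows straight back onto the image, which is the round trip the new prop_im2colunsafe_sym_on_same_stride test exercises.

    import Numeric.LinearAlgebra
    import Grenade.Layers.Convolution.Internal

    -- Sketch: arguments are kernelRows kernelCols strideRows strideCols inputRows inputCols
    roundTrip :: (Matrix Double, Matrix Double)
    roundTrip =
      let input = (3><4) [1 .. 12]                 -- 3x4 image
          cols  = im2colUnsafe 3 2 3 2 3 4 input   -- (2><6): one flattened 3x2 window per row
                                                   -- [ 1, 2, 5, 6,  9, 10
                                                   -- , 3, 4, 7, 8, 11, 12 ]
          back  = col2imUnsafe 3 2 3 2 3 4 cols    -- == input, since the windows do not overlap
      in (cols, back)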