Merge pull request #11 from HuwCampbell/topic/learning-separation

Topic/learning separation
2024-11-22 06:55:13 +03:00 · 2016-12-04 11:19:24 +11:00 · 2016-12-04 11:19:24 +11:00 · 70fbac3924
commit 70fbac3924
parent be61f9b762 dfbb6c17b8
23 changed files with 302 additions and 240 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -6,7 +6,7 @@
 	url = https://github.com/ambiata/x.git
 [submodule "lib/p"]
 	path = lib/p
-	url = https://github.com/ambiata/p.git
+	url = https://github.com/ambiata/p
 [submodule "lib/disorder.hs"]
 	path = lib/disorder.hs
 	url = https://github.com/ambiata/disorder.hs.git
--- a/README.md
+++ b/README.md
@ -13,10 +13,10 @@ Five is right out.
 Grenade is a dependently typed, practical, and pretty quick neural network library for concise and precise
 specifications of complex networks in Haskell.

-As an example, a network which can achieve less than 1.5% error on mnist can be specified and
+As an example, a network which can achieve less than 1.5% error on MNIST can be specified and
 initialised with random weights in under 10 lines of code with
 ```haskell
-randomMnistNet :: MonadRandom m => m (Network Identity '[('D2 28 28), ('D3 24 24 10), ('D3 12 12 10), ('D3 12 12 10), ('D3 8 8 16), ('D3 4 4 16), ('D1 256), ('D1 256), ('D1 80), ('D1 80), ('D1 10), ('D1 10)])
+randomMnistNet :: MonadRandom m => m (Network '[ 'D2 28 28, 'D3 24 24 10, 'D3 12 12 10, 'D3 12 12 10, 'D3 8 8 16, 'D3 4 4 16, 'D1 256, 'D1 256, 'D1 80, 'D1 80, 'D1 10, 'D1 10])
 randomMnistNet = do
  a :: Convolution 1 10 5 5 1 1  <- randomConvolution
  let b :: Pooling 2 2 2 2        = Pooling
--- a/grenade.cabal
+++ b/grenade.cabal
@ -14,6 +14,7 @@ library
  build-depends:
                       base                            >= 4.8         && < 5
                     , bytestring                      == 0.10.*
+                     , async
                     , either                          == 4.4.*
                     , exceptions                      == 0.8.*
                     , hmatrix
--- a/lib/p
+++ b/lib/p
@ -0,0 +1 @@
+Subproject commit 3ea83a82b058ba2e1dd216d9e7832fd49cf33dbd
--- a/main/feedforward.hs
+++ b/main/feedforward.hs
@ -1,6 +1,5 @@
 {-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
-{-# LANGUAGE KindSignatures        #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TupleSections         #-}
@ -8,7 +7,6 @@
 {-# LANGUAGE FlexibleContexts      #-}

 import           Control.Monad
-import           Control.Monad.Identity
 import           Control.Monad.Random

 import           GHC.TypeLits
@ -28,32 +26,30 @@ import           Grenade
 -- between the shapes, so inference can't do it all for us.

 -- With around 100000 examples, this should show two clear circles which have been learned by the network.
-randomNet :: (MonadRandom m)  => m (Network Identity '[('D1 2), ('D1 40), ('D1 40), ('D1 10), ('D1 10), ('D1 1), ('D1 1)])
+randomNet :: (MonadRandom m)  => m (Network '[ 'D1 2, 'D1 40, 'D1 40, 'D1 10, 'D1 10, 'D1 1, 'D1 1])
 randomNet = do
  a :: FullyConnected 2 40  <- randomFullyConnected
  b :: FullyConnected 40 10 <- randomFullyConnected
  c :: FullyConnected 10 1  <- randomFullyConnected
  return $ a :~> Tanh :~> b :~> Relu :~> c :~> O Logit

-netTest :: MonadRandom m => Double -> Int -> m String
+netTest :: MonadRandom m => LearningParameters -> Int -> m String
 netTest rate n = do
    inps <- replicateM n $ do
-      s <- getRandom
+      s  <- getRandom
      return $ S1D' $ SA.randomVector s SA.Uniform * 2 - 1
    let outs = flip map inps $ \(S1D' v) ->
-                 if v `inCircle` (fromRational 0.33, 0.33)
-                      || v `inCircle` (fromRational (-0.33), 0.33)
+                 if v `inCircle` (fromRational 0.33, 0.33)  || v `inCircle` (fromRational (-0.33), 0.33)
                   then S1D' $ fromRational 1
                   else S1D' $ fromRational 0
    net0 <- randomNet

-    return . runIdentity $ do
-      trained <- foldM trainEach net0 (zip inps outs)
-      let testIns = [ [ (x,y)  | x <- [0..50] ]
-                               | y <- [0..20] ]
+    let trained = foldl trainEach net0 (zip inps outs)
+    let testIns = [ [ (x,y)  | x <- [0..50] ]
+                              | y <- [0..20] ]

-      outMat <- traverse (traverse (\(x,y) -> (render . normx) <$> runNet trained (S1D' $ SA.vector [x / 25 - 1,y / 10 - 1]))) testIns
-      return $ unlines outMat
+    let outMat  = fmap (fmap (\(x,y) -> (render . normx) $ runNet trained (S1D' $ SA.vector [x / 25 - 1,y / 10 - 1]))) testIns
+    return $ unlines outMat

  where
    inCircle :: KnownNat n => SA.R n -> (SA.R n, Double) -> Bool
@ -70,11 +66,16 @@ netTest rate n = do
    normx (S1D' r) = SA.mean r


-data FeedForwardOpts = FeedForwardOpts Int Double
+data FeedForwardOpts = FeedForwardOpts Int LearningParameters

 feedForward' :: Parser FeedForwardOpts
-feedForward' = FeedForwardOpts <$> option auto (long "examples" <> short 'e' <> value 1000000)
-                               <*> option auto (long "train_rate" <> short 'r' <> value 0.01)
+feedForward' =
+  FeedForwardOpts <$> option auto (long "examples" <> short 'e' <> value 100000)
+                  <*> (LearningParameters
+                      <$> option auto (long "train_rate" <> short 'r' <> value 0.01)
+                      <*> option auto (long "momentum" <> value 0.9)
+                      <*> option auto (long "l2" <> value 0.0001)
+                      )

 main :: IO ()
 main = do
--- a/main/mnist.hs
+++ b/main/mnist.hs
@ -1,6 +1,5 @@
 {-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
-{-# LANGUAGE KindSignatures        #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TupleSections         #-}
@ -9,7 +8,6 @@

 import           Control.Applicative
 import           Control.Monad
-import           Control.Monad.Identity
 import           Control.Monad.Random

 import qualified Data.Attoparsec.Text as A
@ -32,18 +30,17 @@ import           Grenade

 -- With the mnist data from Kaggle normalised to doubles between 0 and 1, learning rate of 0.01 and 15 iterations,
 -- this network should get down to about a 1.3% error rate.
-randomMnistNet :: (MonadRandom m) => m (Network Identity '[('D2 28 28), ('D2 32 32), ('D3 28 28 10), ('D3 14 14 10), ('D3 14 14 10), ('D3 10 10 16), ('D3 5 5 16), ('D1 400), ('D1 400), ('D1 80), ('D1 80), ('D1 10), ('D1 10)])
+randomMnistNet :: MonadRandom m => m (Network '[ 'D2 28 28, 'D3 24 24 10, 'D3 12 12 10, 'D3 12 12 10, 'D3 8 8 16, 'D3 4 4 16, 'D1 256, 'D1 256, 'D1 80, 'D1 80, 'D1 10, 'D1 10])
 randomMnistNet = do
-  let pad :: Pad 2 2 2 2          = Pad
  a :: Convolution 1 10 5 5 1 1  <- randomConvolution
  let b :: Pooling 2 2 2 2        = Pooling
  c :: Convolution 10 16 5 5 1 1 <- randomConvolution
  let d :: Pooling 2 2 2 2        = Pooling
-  e :: FullyConnected 400 80     <- randomFullyConnected
+  e :: FullyConnected 256 80     <- randomFullyConnected
  f :: FullyConnected 80  10     <- randomFullyConnected
-  return $ pad :~> a :~> b :~> Relu :~> c :~> d :~> FlattenLayer :~> Relu :~> e :~> Logit :~> f :~> O Logit
+  return $ a :~> b :~> Relu :~> c :~> d :~> FlattenLayer :~> Relu :~> e :~> Logit :~> f :~> O Logit

-convTest :: Int -> FilePath -> FilePath -> Double -> IO ()
+convTest :: Int -> FilePath -> FilePath -> LearningParameters -> IO ()
 convTest iterations trainFile validateFile rate = do
  net0 <- evalRandIO randomMnistNet
  fT   <- T.readFile trainFile
@ -52,7 +49,7 @@ convTest iterations trainFile validateFile rate = do
  let validateRows = traverse (A.parseOnly p) (T.lines fV)
  case (trainRows, validateRows) of
    (Right tr', Right vr') -> foldM_ (runIteration tr' vr') net0 [1..iterations]
-    err                    -> putStrLn $ show err
+    err                    -> print err

  where
    trainEach !rate' !nt !(i, o) = train rate' i o nt
@ -65,20 +62,24 @@ convTest iterations trainFile validateFile rate = do
      return (S2D' $ SA.fromList pixels, S1D' $ SA.fromList lab')

    runIteration trainRows validateRows net i = do
-      let trained' = runIdentity $ foldM (trainEach (rate * (0.9 ^ i))) net trainRows
-      let res      = runIdentity $ traverse (\(rowP,rowL) -> (rowL,) <$> runNet trained' rowP) validateRows
+      let trained' = foldl (trainEach rate) net trainRows
+      let res      = fmap (\(rowP,rowL) -> (rowL,) $ runNet trained' rowP) validateRows
      let res'     = fmap (\(S1D' label, S1D' prediction) -> (maxIndex (SA.extract label), maxIndex (SA.extract prediction))) res
-      putStrLn $ show trained'
+      print trained'
      putStrLn $ "Iteration " ++ show i ++ ": " ++ show (length (filter ((==) <$> fst <*> snd) res')) ++ " of " ++ show (length res')
      return trained'

-data MnistOpts = MnistOpts FilePath FilePath Int Double
+data MnistOpts = MnistOpts FilePath FilePath Int LearningParameters

 mnist' :: Parser MnistOpts
-mnist' = MnistOpts <$> (argument str (metavar "TRAIN"))
-                   <*> (argument str (metavar "VALIDATE"))
+mnist' = MnistOpts <$> argument str (metavar "TRAIN")
+                   <*> argument str (metavar "VALIDATE")
                   <*> option auto (long "iterations" <> short 'i' <> value 15)
-                   <*> option auto (long "train_rate" <> short 'r' <> value 0.01)
+                   <*> (LearningParameters
+                       <$> option auto (long "train_rate" <> short 'r' <> value 0.01)
+                       <*> option auto (long "momentum" <> value 0.9)
+                       <*> option auto (long "l2" <> value 0.0001)
+                       )

 main :: IO ()
 main = do
--- a/src/Grenade.hs
+++ b/src/Grenade.hs
@ -1,16 +1,3 @@
-{-# LANGUAGE BangPatterns          #-}
-{-# LANGUAGE DataKinds             #-}
-{-# LANGUAGE GADTs                 #-}
-{-# LANGUAGE KindSignatures        #-}
-{-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
-{-# LANGUAGE TypeOperators         #-}
-{-# LANGUAGE TypeFamilies          #-}
-{-# LANGUAGE PolyKinds             #-}
-{-# LANGUAGE MultiParamTypeClasses #-}
-{-# LANGUAGE FlexibleContexts      #-}
-{-# LANGUAGE FlexibleInstances     #-}
-
 module Grenade (
    module X
  ) where
--- a/src/Grenade/Core/Network.hs
+++ b/src/Grenade/Core/Network.hs
@ -1,9 +1,7 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE GADTs                 #-}
 {-# LANGUAGE KindSignatures        #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE PolyKinds             #-}
@ -14,36 +12,53 @@
 module Grenade.Core.Network (
    Layer (..)
  , Network (..)
+  , UpdateLayer (..)
+  , LearningParameters (..)
  ) where

 import           Grenade.Core.Shape

+data LearningParameters = LearningParameters {
+    learningRate :: Double
+  , learningMomentum :: Double
+  , learningRegulariser :: Double
+  } deriving (Eq, Show)
+
+-- | Class for updating a layer. All layers implement this, and it is
+--   shape independent.
+class UpdateLayer x where
+  -- | The type for the gradient for this layer.
+  --   Unit if there isn't a gradient to pass back.
+  type Gradient x :: *
+  -- | Update a layer with its gradient and learning parameters
+  runUpdate      :: LearningParameters -> x -> Gradient x -> x
+
 -- | Class for a layer. All layers implement this, however, they don't
 --   need to implement it for all shapes, only ones which are appropriate.
-class Layer (m :: * -> *) x (i :: Shape) (o :: Shape) where
+class UpdateLayer x => Layer x (i :: Shape) (o :: Shape) where
  -- | Used in training and scoring. Take the input from the previous
  --   layer, and give the output from this layer.
-  runForwards    :: x -> S' i -> m (S' o)
-  -- | Back propagate a step. Takes a learning rate (move from here?)
-  --   the current layer, the input that the layer gave from the input
-  --   and the back propagated derivatives from the layer above.
-  --   Returns the updated layer and the derivatives to push back further.
-  runBackards    :: Double -> x -> S' i -> S' o -> m (x, S' i)
+  runForwards    :: x -> S' i -> S' o
+  -- | Back propagate a step. Takes the current layer, the input that the
+  --   layer gave from the input and the back propagated derivatives from
+  --   the layer above.
+  --   Returns the gradient layer and the derivatives to push back further.
+  runBackards    :: x -> S' i -> S' o -> (Gradient x, S' i)

 -- | Type of a network.
 --   The [Shape] type specifies the shapes of data passed between the layers.
 --   Could be considered to be a heterogeneous list of layers which are able to
 --   transform the data shapes of the network.
-data Network :: (* -> *) -> [Shape] -> * where
-    O     :: (Show x, Layer m x i o, KnownShape o, KnownShape i)
+data Network :: [Shape] -> * where
+    O     :: (Show x, Layer x i o, KnownShape o, KnownShape i)
          => !x
-          -> Network m '[i, o]
-    (:~>) :: (Show x, Layer m x i h, KnownShape h, KnownShape i)
+          -> Network '[i, o]
+    (:~>) :: (Show x, Layer x i h, KnownShape h, KnownShape i)
          => !x
-          -> !(Network m (h ': hs))
-          -> Network m (i ': h ': hs)
+          -> !(Network (h ': hs))
+          -> Network (i ': h ': hs)
 infixr 5 :~>

-instance Show (Network m h) where
+instance Show (Network h) where
  show (O a) = "O " ++ show a
  show (i :~> o) = show i ++ "\n:~>\n" ++ show o
--- a/src/Grenade/Core/Runner.hs
+++ b/src/Grenade/Core/Runner.hs
@ -1,9 +1,7 @@
 {-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE GADTs                 #-}
 {-# LANGUAGE DataKinds             #-}
-{-# LANGUAGE KindSignatures        #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}

@ -17,39 +15,44 @@ import           Grenade.Core.Network
 import           Grenade.Core.Shape

 -- | Update a network with new weights after training with an instance.
-train :: forall m i o hs. (Monad m, Head hs ~ i, Last hs ~ o, KnownShape i, KnownShape o)
-      => Double               -- ^ learning rate
+train :: forall i o hs. (Head hs ~ i, Last hs ~ o, KnownShape i, KnownShape o)
+      => LearningParameters   -- ^ learning rate
      -> S' i                 -- ^ input vector
      -> S' o                 -- ^ target vector
-      -> Network m hs         -- ^ network to train
-      -> m (Network m hs)
-train rate x0 target = fmap fst . go x0
+      -> Network hs           -- ^ network to train
+      -> Network hs
+train rate x0 target = fst . go x0
  where
-    go  :: forall m' j js. (Monad m', Head js ~ j, Last js ~ o, KnownShape j)
+    go  :: forall j js. (Head js ~ j, Last js ~ o, KnownShape j)
        => S' j                -- ^ input vector
-        -> Network m' js       -- ^ network to train
-        -> m' (Network m' js, S' j)
+        -> Network js       -- ^ network to train
+        -> (Network js, S' j)
    -- handle input from the beginning, feeding upwards.
    go !x (layer :~> n)
-        = do y             <- runForwards layer x
+        = let y             = runForwards layer x
              -- run the rest of the network, and get the layer from above.
-             (n', dWs')    <- go y n
+              (n', dWs')    = go y n
              -- calculate the gradient for this layer to pass down,
-             (layer', dWs) <- runBackards rate layer x dWs'
-             return (layer' :~> n', dWs)
+              (layer', dWs) = runBackards layer x dWs'
+
+              -- Update this layer using the gradient
+              newLayer      = runUpdate rate layer layer'
+
+          in (newLayer :~> n', dWs)

    -- handle the output layer, bouncing the derivatives back down.
    go !x (O layer)
-        = do  y                 <- runForwards layer x
+        = let y                 = runForwards layer x
              -- the gradient (how much y affects the error)
-              (layer', dWs)     <- runBackards rate layer x (y - target)
-              return (O layer', dWs)
+              (layer', dWs)     = runBackards layer x (y - target)
+              newLayer          = runUpdate rate layer layer'
+
+          in (O newLayer, dWs)

 -- | Just forwards propagation with no training.
-runNet :: forall m hs. (Monad m)
-       => Network m hs
-       -> (S' (Head hs))         -- ^ input vector
-       -> m (S' (Last hs))     -- ^ target vector
-runNet (layer :~> n)  !x = do y <- runForwards layer x
-                              runNet n y
+runNet :: Network hs
+       -> S' (Head hs)         -- ^ input vector
+       -> S' (Last hs)         -- ^ target vector
+runNet (layer :~> n)  !x = let y = runForwards layer x
+                           in  runNet n y
 runNet (O layer)      !x = runForwards layer x
--- a/src/Grenade/Core/Shape.hs
+++ b/src/Grenade/Core/Shape.hs
@ -39,17 +39,17 @@ data Shape =
 instance KnownShape x => Num (S' x) where
  (+) (S1D' x) (S1D' y) = S1D' (x + y)
  (+) (S2D' x) (S2D' y) = S2D' (x + y)
-  (+) (S3D' x) (S3D' y) = S3D' (vectorZip (\x' y' -> x' + y') x y)
+  (+) (S3D' x) (S3D' y) = S3D' (vectorZip (+) x y)
  (+)  _ _ = error "Impossible to have different constructors for the same shaped network"

  (-) (S1D' x) (S1D' y) = S1D' (x - y)
  (-) (S2D' x) (S2D' y) = S2D' (x - y)
-  (-) (S3D' x) (S3D' y) = S3D' (vectorZip (\x' y' -> x' - y') x y)
+  (-) (S3D' x) (S3D' y) = S3D' (vectorZip (-) x y)
  (-)  _ _ = error "Impossible to have different constructors for the same shaped network"

  (*) (S1D' x) (S1D' y) = S1D' (x * y)
  (*) (S2D' x) (S2D' y) = S2D' (x * y)
-  (*) (S3D' x) (S3D' y) = S3D' (vectorZip (\x' y' -> x' * y') x y)
+  (*) (S3D' x) (S3D' y) = S3D' (vectorZip (*) x y)
  (*)  _ _ = error "Impossible to have different constructors for the same shaped network"

  abs (S1D' x) = S1D' (abs x)
--- a/src/Grenade/Core/Vector.hs
+++ b/src/Grenade/Core/Vector.hs
@ -26,7 +26,7 @@ instance Foldable (Vector n) where
  foldr f b (Vector as) = foldr f b as

 instance KnownNat n => Traversable (Vector n) where
-  traverse f (Vector as) = fmap mkVector $ traverse f as
+  traverse f (Vector as) = mkVector <$> traverse f as

 instance Functor (Vector n) where
  fmap f (Vector as) = Vector (fmap f as)
@ -41,7 +41,7 @@ mkVector :: forall n a. KnownNat n => [a] -> Vector n a
 mkVector as
 = let du = fromIntegral . natVal $ (undefined :: Proxy n)
       la = length as
-   in if (du == la)
+   in if du == la
        then Vector as
        else error $ "Error creating staticly sized Vector of length: " ++
                     show du ++ " list is of length:" ++ show la
--- a/src/Grenade/Layers/Convolution.hs
+++ b/src/Grenade/Layers/Convolution.hs
@ -2,6 +2,7 @@
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
 {-# LANGUAGE StandaloneDeriving    #-}
+{-# LANGUAGE RecordWildCards       #-}
 {-# LANGUAGE GADTs                 #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
@ -13,6 +14,7 @@

 module Grenade.Layers.Convolution (
    Convolution (..)
+  , Convolution' (..)
  , randomConvolution
  , im2col
  , vid2col
@ -65,6 +67,24 @@ data Convolution :: Nat -- ^ Number of channels, for the first layer this could
              -> !(L kernelFlattened filters) -- ^ The last kernel update (or momentum)
              -> Convolution channels filters kernelRows kernelColumns strideRows strideColumns

+data Convolution' :: Nat -- ^ Number of channels, for the first layer this could be RGB for instance.
+                  -> Nat -- ^ Number of filters, this is the number of channels output by the layer.
+                  -> Nat -- ^ The number of rows in the kernel filter
+                  -> Nat -- ^ The number of column in the kernel filter
+                  -> Nat -- ^ The row stride of the convolution filter
+                  -> Nat -- ^ The columns stride of the convolution filter
+                  -> * where
+  Convolution' :: ( KnownNat channels
+                  , KnownNat filters
+                  , KnownNat kernelRows
+                  , KnownNat kernelColumns
+                  , KnownNat strideRows
+                  , KnownNat strideColumns
+                  , KnownNat kernelFlattened
+                  , kernelFlattened ~ (kernelRows * kernelColumns * channels))
+               => !(L kernelFlattened filters) -- ^ The kernel filter gradient
+               -> Convolution' channels filters kernelRows kernelColumns strideRows strideColumns
+
 instance Show (Convolution c f k k' s s') where
  show (Convolution a _) = renderConv a
    where
@ -99,9 +119,16 @@ randomConvolution = do
        mm = konst 0
    return $ Convolution wN mm

+instance UpdateLayer (Convolution channels filters kernelRows kernelCols strideRows strideCols) where
+  type Gradient (Convolution channels filters kernelRows kernelCols strideRows strideCols) = (Convolution' channels filters kernelRows kernelCols strideRows strideCols)
+  runUpdate LearningParameters {..} (Convolution oldKernel oldMomentum) (Convolution' kernelGradient) =
+    let newMomentum    = konst learningMomentum * oldMomentum - konst learningRate * kernelGradient
+        regulariser    = konst (learningRegulariser * learningRate) * oldKernel
+        newKernel      = oldKernel + newMomentum - regulariser
+    in Convolution newKernel newMomentum
+
 -- | A two dimentional image may have a convolution filter applied to it
-instance ( Monad m
-         , KnownNat kernelRows
+instance ( KnownNat kernelRows
         , KnownNat kernelCols
         , KnownNat filters
         , KnownNat strideRows
@ -112,7 +139,7 @@ instance ( Monad m
         , KnownNat outputCols
         , ((outputRows - 1) * strideRows) ~ (inputRows - kernelRows)
         , ((outputCols - 1) * strideCols) ~ (inputCols - kernelCols)
-         ) => Layer m (Convolution 1 filters kernelRows kernelCols strideRows strideCols) ('D2 inputRows inputCols) ('D3 outputRows outputCols filters) where
+         ) => Layer (Convolution 1 filters kernelRows kernelCols strideRows strideCols) ('D2 inputRows inputCols) ('D3 outputRows outputCols filters) where
  runForwards (Convolution kernel _) (S2D' input) =
    let ex = extract input
        ek = extract kernel
@ -126,8 +153,8 @@ instance ( Monad m
        mt = c LA.<> ek
        r  = col2vid 1 1 1 1 ox oy mt
        rs = fmap (fromJust . create) r
-    in  return . S3D' $ mkVector rs
-  runBackards rate (Convolution kernel momentum) (S2D' input) (S3D' dEdy) =
+    in  S3D' $ mkVector rs
+  runBackards (Convolution kernel _) (S2D' input) (S3D' dEdy) =
    let ex = extract input
        ix = fromIntegral $ natVal (Proxy :: Proxy inputRows)
        iy = fromIntegral $ natVal (Proxy :: Proxy inputCols)
@ -145,20 +172,15 @@ instance ( Monad m
        vs = vid2col 1 1 1 1 ox oy eo

        kN = fromJust . create $ tr c LA.<> vs
-        mm = momentum * 0.9 - konst rate * kN
-        wd = konst (0.0001 * rate) * kernel
-        rM = kernel + mm - wd
-
        dW = vs LA.<> tr ek

        xW = col2im kx ky sx sy ix iy dW
-    in  return (Convolution rM mm, S2D' . fromJust . create $ xW)
+    in  (Convolution' kN, S2D' . fromJust . create $ xW)


 -- | A three dimensional image (or 2d with many channels) can have
 --   an appropriately sized convolution filter run across it.
-instance ( Monad m
-         , KnownNat kernelRows
+instance ( KnownNat kernelRows
         , KnownNat kernelCols
         , KnownNat filters
         , KnownNat strideRows
@ -170,7 +192,7 @@ instance ( Monad m
         , KnownNat channels
         , ((outputRows - 1) * strideRows) ~ (inputRows - kernelRows)
         , ((outputCols - 1) * strideCols) ~ (inputCols - kernelCols)
-         ) => Layer m (Convolution channels filters kernelRows kernelCols strideRows strideCols) ('D3 inputRows inputCols channels) ('D3 outputRows outputCols filters) where
+         ) => Layer (Convolution channels filters kernelRows kernelCols strideRows strideCols) ('D3 inputRows inputCols channels) ('D3 outputRows outputCols filters) where
  runForwards (Convolution kernel _) (S3D' input) =
    let ex = vecToList $ fmap extract input
        ek = extract kernel
@ -186,8 +208,8 @@ instance ( Monad m
        mt = c LA.<> ek
        r  = col2vid 1 1 1 1 ox oy mt
        rs = fmap (fromJust . create) r
-    in  return . S3D' $ mkVector rs
-  runBackards rate (Convolution kernel momentum) (S3D' input) (S3D' dEdy) =
+    in  S3D' $ mkVector rs
+  runBackards (Convolution kernel _) (S3D' input) (S3D' dEdy) =
    let ex = vecToList $ fmap extract input
        ix = fromIntegral $ natVal (Proxy :: Proxy inputRows)
        iy = fromIntegral $ natVal (Proxy :: Proxy inputCols)
@ -205,14 +227,11 @@ instance ( Monad m
        vs = vid2col 1 1 1 1 ox oy eo

        kN = fromJust . create $ tr c LA.<> vs
-        mm = momentum * 0.9 - konst rate * kN
-        wd = konst (0.0005 * rate) * kernel
-        rM = kernel + mm - wd

        dW = vs LA.<> tr ek

        xW = col2vid kx ky sx sy ix iy dW
-    in  return (Convolution rM mm, S3D' . mkVector . fmap (fromJust . create) $ xW)
+    in  (Convolution' kN, S3D' . mkVector . fmap (fromJust . create) $ xW)

 im2col :: Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double
 im2col nrows ncols srows scols m =
--- a/src/Grenade/Layers/Crop.hs
+++ b/src/Grenade/Layers/Crop.hs
@ -1,7 +1,5 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE GADTs                 #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
@ -39,9 +37,12 @@ data Crop :: Nat
 instance Show (Crop cropLeft cropTop cropRight cropBottom) where
  show Crop = "Crop"

+instance UpdateLayer (Crop l t r b) where
+  type Gradient (Crop l t r b) = ()
+  runUpdate _ x _ = x
+
 -- | A two dimentional image can be cropped.
-instance ( Monad m
-         , KnownNat cropLeft
+instance ( KnownNat cropLeft
         , KnownNat cropTop
         , KnownNat cropRight
         , KnownNat cropBottom
@ -51,7 +52,7 @@ instance ( Monad m
         , KnownNat outputColumns
         , (inputRows - cropTop - cropBottom) ~ outputRows
         , (inputColumns - cropLeft - cropRight) ~ outputColumns
-         ) => Layer m (Crop cropLeft cropTop cropRight cropBottom) ('D2 inputRows inputColumns) ('D2 outputRows outputColumns) where
+         ) => Layer (Crop cropLeft cropTop cropRight cropBottom) ('D2 inputRows inputColumns) ('D2 outputRows outputColumns) where
  runForwards Crop (S2D' input) =
    let cropl = fromIntegral $ natVal (Proxy :: Proxy cropLeft)
        cropt = fromIntegral $ natVal (Proxy :: Proxy cropTop)
@ -59,12 +60,12 @@ instance ( Monad m
        ncols = fromIntegral $ natVal (Proxy :: Proxy outputColumns)
        m  = extract input
        r  = subMatrix (cropt, cropl) (nrows, ncols) m
-    in  return . S2D' . fromJust . create $ r
-  runBackards _ crop _ (S2D' dEdy) =
+    in  S2D' . fromJust . create $ r
+  runBackards _ _ (S2D' dEdy) =
    let cropl = fromIntegral $ natVal (Proxy :: Proxy cropLeft)
        cropt = fromIntegral $ natVal (Proxy :: Proxy cropTop)
        cropr = fromIntegral $ natVal (Proxy :: Proxy cropRight)
        cropb = fromIntegral $ natVal (Proxy :: Proxy cropBottom)
        eo    = extract dEdy
        vs    = diagBlock [konst 0 (cropt,cropl), eo, konst 0 (cropb,cropr)]
-    in  return (crop, S2D' . fromJust . create $ vs)
+    in  ((), S2D' . fromJust . create $ vs)
--- a/src/Grenade/Layers/Dropout.hs
+++ b/src/Grenade/Layers/Dropout.hs
@ -1,7 +1,5 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@ -11,15 +9,14 @@

 module Grenade.Layers.Dropout (
    Dropout (..)
+  , randomDropout
  ) where

 import           Control.Monad.Random hiding (fromList)
-import           Control.Monad.State

 import           GHC.TypeLits
 import           Grenade.Core.Shape
 import           Grenade.Core.Network
-import           Grenade.Core.Phase

 import           Numeric.LinearAlgebra.Static

@ -29,23 +26,25 @@ import           Numeric.LinearAlgebra.Static
 -- After backpropogation, we return a new matrix/vector, with different bits dropped out.
 -- Double is the proportion to drop in each training iteration (like 1% or 5% would be
 -- reasonable).
-data Dropout o = Dropout Double (R o)
+data Dropout o =
+  Dropout (R o)
+  | Pass Double
  deriving Show

+instance (KnownNat i) => UpdateLayer (Dropout i) where
+  type Gradient (Dropout i) = ()
+  runUpdate _ x _ = x
+
 randomDropout :: (MonadRandom m, KnownNat i)
              => Double -> m (Dropout i)
 randomDropout rate = do
    seed  <- getRandom
    let wN = randomVector seed Uniform
        xs = dvmap (\a -> if a <= rate then 0 else 1) wN
-    return $ Dropout rate xs
+    return $ Dropout xs

-instance (MonadRandom m, MonadState Phase m, KnownNat i) => Layer m (Dropout i) ('D1 i) ('D1 i) where
-  runForwards (Dropout rate drops) (S1D' x) = isTrainingPhase >>= \case
-    True  -> return . S1D' $ x * drops
-    False -> return . S1D' $ dvmap (* (1 - rate)) x
-  runBackards _ oldDropout@(Dropout rate drops) _ (S1D' x) = isTrainingPhase >>= \case
-    True -> do
-      newDropout <- randomDropout rate
-      return (newDropout,  S1D' $ x * drops)
-    False -> return (oldDropout,  S1D' $  dvmap (* (1 - rate)) x)
+instance (KnownNat i) => Layer (Dropout i) ('D1 i) ('D1 i) where
+  runForwards (Dropout drops) (S1D' x) = S1D' $ x * drops
+  runForwards (Pass rate) (S1D' x)= S1D' $ dvmap (* (1 - rate)) x
+  runBackards (Dropout drops) _ (S1D' x) = ((),  S1D' $ x * drops)
+  runBackards (Pass rate) _ (S1D' x) = ((),  S1D' $  dvmap (* (1 - rate)) x)
--- a/src/Grenade/Layers/Flatten.hs
+++ b/src/Grenade/Layers/Flatten.hs
@ -25,20 +25,24 @@ import           Grenade.Core.Network
 data FlattenLayer = FlattenLayer
  deriving Show

-instance (Monad m, KnownNat a, KnownNat x, KnownNat y, a ~ (x * y)) => Layer m FlattenLayer ('D2 x y) ('D1 a) where
-  runForwards _ (S2D' y)     = return $ S1D' . fromList . toList . flatten . extract $ y
-  runBackards _ _ _ (S1D' y) = return (FlattenLayer, S2D' . fromList . toList . unwrap $ y)
+instance UpdateLayer FlattenLayer where
+  type Gradient FlattenLayer = ()
+  runUpdate _ _ _ = FlattenLayer

-instance (Monad m, KnownNat a, KnownNat x, KnownNat y, KnownNat z, a ~ (x * y * z)) => Layer m FlattenLayer ('D3 x y z) ('D1 a) where
-  runForwards _ (S3D' y)     = return $ S1D' . raiseShapeError . create . vjoin . vecToList . fmap (flatten . extract) $ y
-  runBackards _ _ _ (S1D' o) = do
+instance (KnownNat a, KnownNat x, KnownNat y, a ~ (x * y)) => Layer FlattenLayer ('D2 x y) ('D1 a) where
+  runForwards _ (S2D' y)   = S1D' . fromList . toList . flatten . extract $ y
+  runBackards _ _ (S1D' y) = ((), S2D' . fromList . toList . unwrap $ y)
+
+instance (KnownNat a, KnownNat x, KnownNat y, KnownNat z, a ~ (x * y * z)) => Layer FlattenLayer ('D3 x y z) ('D1 a) where
+  runForwards _ (S3D' y)     = S1D' . raiseShapeError . create . vjoin . vecToList . fmap (flatten . extract) $ y
+  runBackards _ _ (S1D' o) =
    let x'     = fromIntegral $ natVal (Proxy :: Proxy x)
        y'     = fromIntegral $ natVal (Proxy :: Proxy y)
        z'     = fromIntegral $ natVal (Proxy :: Proxy z)
        vecs   = takesV (replicate z' (x' * y')) (extract o)
        ls     = fmap (raiseShapeError . create . reshape y') vecs
        ls'    = mkVector ls :: Vector z (L x y)
-    return (FlattenLayer, S3D' ls')
+    in ((), S3D' ls')

 raiseShapeError :: Maybe a -> a
 raiseShapeError (Just x) = x
--- a/src/Grenade/Layers/FullyConnected.hs
+++ b/src/Grenade/Layers/FullyConnected.hs
@ -1,7 +1,6 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
+{-# LANGUAGE RecordWildCards       #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@ -24,26 +23,39 @@ import           Grenade.Core.Shape
 -- | A basic fully connected (or inner product) neural network layer.
 data FullyConnected i o = FullyConnected
                        !(R o)   -- Bias neuron weights
+                        !(R o)   -- Bias neuron momentum
                        !(L o i) -- Activation weights
-                        !(L o i) -- Activation momentums
+                        !(L o i) -- Momentum
+
+data FullyConnected' i o = FullyConnected'
+                         !(R o)   -- Bias neuron gradient
+                         !(L o i) -- Activation gradient

 instance Show (FullyConnected i o) where
-  show (FullyConnected _ _ _) = "FullyConnected"
+  show FullyConnected {} = "FullyConnected"

-instance (Monad m, KnownNat i, KnownNat o) => Layer m (FullyConnected i o) ('D1 i) ('D1 o) where
+instance (KnownNat i, KnownNat o) => UpdateLayer (FullyConnected i o) where
+  type Gradient (FullyConnected i o) = (FullyConnected' i o)
+
+  runUpdate LearningParameters {..} (FullyConnected oldBias oldBiasMomentum oldActivations oldMomentum) (FullyConnected' biasGradient activationGradient) =
+    let newBiasMomentum = konst learningMomentum * oldBiasMomentum - konst learningRate * biasGradient
+        newBias         = oldBias + newBiasMomentum
+        newMomentum     = konst learningMomentum * oldMomentum - konst learningRate * activationGradient
+        regulariser     = konst (learningRegulariser * learningRate) * oldActivations
+        newActivations  = oldActivations + newMomentum - regulariser
+    in FullyConnected newBias newBiasMomentum newActivations newMomentum
+
+instance (KnownNat i, KnownNat o) => Layer (FullyConnected i o) ('D1 i) ('D1 o) where
  -- Do a matrix vector multiplication and return the result.
-  runForwards (FullyConnected wB wN _) (S1D' v) = return $ S1D' (wB + wN #> v)
+  runForwards (FullyConnected wB _ wN _) (S1D' v) = S1D' (wB + wN #> v)

  -- Run a backpropogation step for a full connected layer.
-  runBackards rate (FullyConnected wB wN mm) (S1D' x) (S1D' dEdy) =
-          let wB'  = wB - konst rate * dEdy
-              mm'  = 0.9 * mm - konst rate * (dEdy `outer` x)
-              wd'  = konst (0.0001 * rate) * wN
-              wN'  = wN + mm' - wd'
-              w'   = FullyConnected wB' wN' mm'
+  runBackards (FullyConnected _ _ wN _) (S1D' x) (S1D' dEdy) =
+          let wB'  = dEdy
+              mm'  = dEdy `outer` x
              -- calcluate derivatives for next step
              dWs  = tr wN #> dEdy
-          in  return (w', S1D' dWs)
+          in  (FullyConnected' wB' mm', S1D' dWs)

 randomFullyConnected :: (MonadRandom m, KnownNat i, KnownNat o)
                     => m (FullyConnected i o)
@ -52,5 +64,6 @@ randomFullyConnected = do
    s2 :: Int <- getRandom
    let wB = randomVector  s1 Uniform * 2 - 1
        wN = uniformSample s2 (-1) 1
+        bm = konst 0
        mm = konst 0
-    return $ FullyConnected wB wN mm
+    return $ FullyConnected wB bm wN mm
--- a/src/Grenade/Layers/Fuse.hs
+++ b/src/Grenade/Layers/Fuse.hs
@ -3,7 +3,6 @@
 {-# LANGUAGE GADTs                 #-}
 {-# LANGUAGE KindSignatures        #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE PolyKinds             #-}
@ -23,23 +22,30 @@ import           Grenade.Core.Shape
 --   This can be used to simplify a network if a complicated repeated structure is used.
 --   This does however have a trade off, internal incremental states in the Wengert tape are
 --   not retained during reverse accumulation. So less RAM is used, but more compute is required.
-data Fuse :: (* -> *) -> Shape -> Shape -> Shape -> * where
-    (:$$) :: (Show x, Show y, Layer m x i h, Layer m y h o, KnownShape h, KnownShape i, KnownShape o)
+data Fuse :: * -> * -> Shape -> Shape -> Shape -> * where
+    (:$$) :: (Show x, Show y, Layer x i h, Layer y h o, KnownShape h, KnownShape i, KnownShape o)
          => !x
          -> !y
-          -> Fuse m i h o
+          -> Fuse x y i h o
 infixr 5 :$$

-instance Show (Fuse m i h o) where
+instance Show (Fuse x y i h o) where
  show (x :$$ y) = "(" ++ show x ++ " :$$ " ++ show y ++ ")"

-instance (Monad m, KnownShape i, KnownShape h, KnownShape o) => Layer m (Fuse m i h o) i o where
-  runForwards (x :$$ y) input = do
-    yInput  :: S' h <- runForwards x input
-    runForwards y yInput
+instance (KnownShape i, KnownShape h, KnownShape o) => UpdateLayer (Fuse x y i h o) where
+  type Gradient (Fuse x y i h o) = (Gradient x, Gradient y)
+  runUpdate lr (x :$$ y) (x', y') =
+    let newX = runUpdate lr x x'
+        newY = runUpdate lr y y'
+    in (newX :$$ newY)

-  runBackards rate (x :$$ y) input backGradient = do
-    yInput  :: S' h <- runForwards x input
-    (y', yGrad)     <- runBackards rate y yInput backGradient
-    (x', xGrad)     <- runBackards rate x input yGrad
-    return (x' :$$ y', xGrad)
+instance (KnownShape i, KnownShape h, KnownShape o) => Layer (Fuse x y i h o) i o where
+  runForwards (x :$$ y) input =
+    let yInput  :: S' h = runForwards x input
+    in runForwards y yInput
+
+  runBackards (x :$$ y) input backGradient =
+    let yInput  :: S' h = runForwards x input
+        (y', yGrad)     = runBackards y yInput backGradient
+        (x', xGrad)     = runBackards x input yGrad
+    in ((x', y'), xGrad)
--- a/src/Grenade/Layers/Logit.hs
+++ b/src/Grenade/Layers/Logit.hs
@ -1,7 +1,5 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@ -11,6 +9,7 @@ module Grenade.Layers.Logit (
    Logit (..)
  ) where

+
 import           Data.Singletons.TypeLits
 import           Grenade.Core.Network
 import           Grenade.Core.Vector
@ -23,17 +22,21 @@ import           Grenade.Core.Shape
 data Logit = Logit
  deriving Show

-instance (Monad m, KnownNat i) => Layer m Logit ('D1 i) ('D1 i) where
-  runForwards _ (S1D' y) = return $ S1D' (logistic y)
-  runBackards _ _ (S1D' y) (S1D' dEdy) = return (Logit, S1D' (logistic' y * dEdy))
+instance UpdateLayer Logit where
+  type Gradient Logit = ()
+  runUpdate _ _ _ = Logit

-instance (Monad m, KnownNat i, KnownNat j) => Layer m Logit ('D2 i j) ('D2 i j) where
-  runForwards _ (S2D' y) = return $ S2D' (logistic y)
-  runBackards _ _ (S2D' y) (S2D' dEdy) = return (Logit, S2D' (logistic' y * dEdy))
+instance (KnownNat i) => Layer Logit ('D1 i) ('D1 i) where
+  runForwards _ (S1D' y) = S1D' (logistic y)
+  runBackards _ (S1D' y) (S1D' dEdy) = ((), S1D' (logistic' y * dEdy))

-instance (Monad m, KnownNat i, KnownNat j, KnownNat k) => Layer m Logit ('D3 i j k) ('D3 i j k) where
-  runForwards _ (S3D' y) = return $ S3D' (fmap logistic y)
-  runBackards _ _ (S3D' y) (S3D' dEdy) = return (Logit, S3D' (vectorZip (\y' dEdy' -> logistic' y' * dEdy') y dEdy))
+instance (KnownNat i, KnownNat j) => Layer Logit ('D2 i j) ('D2 i j) where
+  runForwards _ (S2D' y) = S2D' (logistic y)
+  runBackards _ (S2D' y) (S2D' dEdy) = ((), S2D' (logistic' y * dEdy))
+
+instance (KnownNat i, KnownNat j, KnownNat k) => Layer Logit ('D3 i j k) ('D3 i j k) where
+  runForwards _ (S3D' y) =  S3D' (fmap logistic y)
+  runBackards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> logistic' y' * dEdy') y dEdy))


 logistic :: Floating a => a -> a
--- a/src/Grenade/Layers/Pad.hs
+++ b/src/Grenade/Layers/Pad.hs
@ -1,7 +1,5 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE GADTs                 #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
@ -39,9 +37,12 @@ data Pad  :: Nat
 instance Show (Pad padLeft padTop padRight padBottom) where
  show Pad = "Pad"

+instance UpdateLayer (Pad l t r b) where
+  type Gradient (Pad l t r b) = ()
+  runUpdate _ x _ = x
+
 -- | A two dimentional image can be padped.
-instance ( Monad m
-         , KnownNat padLeft
+instance ( KnownNat padLeft
         , KnownNat padTop
         , KnownNat padRight
         , KnownNat padBottom
@ -51,7 +52,7 @@ instance ( Monad m
         , KnownNat outputColumns
         , (inputRows + padTop + padBottom) ~ outputRows
         , (inputColumns + padLeft + padRight) ~ outputColumns
-         ) => Layer m (Pad padLeft padTop padRight padBottom) ('D2 inputRows inputColumns) ('D2 outputRows outputColumns) where
+         ) => Layer (Pad padLeft padTop padRight padBottom) ('D2 inputRows inputColumns) ('D2 outputRows outputColumns) where
  runForwards Pad (S2D' input) =
    let padl  = fromIntegral $ natVal (Proxy :: Proxy padLeft)
        padt  = fromIntegral $ natVal (Proxy :: Proxy padTop)
@ -59,12 +60,12 @@ instance ( Monad m
        padb  = fromIntegral $ natVal (Proxy :: Proxy padBottom)
        m     = extract input
        r     = diagBlock [konst 0 (padt,padl), m, konst 0 (padb,padr)]
-    in  return . S2D' . fromJust . create $ r
-  runBackards _ pad _ (S2D' dEdy) =
+    in  S2D' . fromJust . create $ r
+  runBackards Pad _ (S2D' dEdy) =
    let padl  = fromIntegral $ natVal (Proxy :: Proxy padLeft)
        padt  = fromIntegral $ natVal (Proxy :: Proxy padTop)
        nrows = fromIntegral $ natVal (Proxy :: Proxy inputRows)
        ncols = fromIntegral $ natVal (Proxy :: Proxy inputColumns)
        m     = extract dEdy
        vs    = subMatrix (padt, padl) (nrows, ncols) m
-    in  return (pad, S2D' . fromJust . create $ vs)
+    in  ((), S2D' . fromJust . create $ vs)
--- a/src/Grenade/Layers/Pooling.hs
+++ b/src/Grenade/Layers/Pooling.hs
@ -51,9 +51,12 @@ instance Show (Pooling k k' s s') where
  show Pooling = "Pooling"


+instance UpdateLayer (Pooling kernelRows kernelColumns strideRows strideColumns) where
+  type Gradient (Pooling kr kc sr sc) = ()
+  runUpdate _ Pooling _ = Pooling
+
 -- | A two dimentional image can be pooled.
-instance ( Monad m
-         , KnownNat kernelRows
+instance ( KnownNat kernelRows
         , KnownNat kernelColumns
         , KnownNat strideRows
         , KnownNat strideColumns
@ -63,7 +66,7 @@ instance ( Monad m
         , KnownNat outputColumns
         , ((outputRows - 1) * strideRows) ~ (inputRows - kernelRows)
         , ((outputColumns - 1) * strideColumns) ~ (inputColumns - kernelColumns)
-         ) => Layer m (Pooling kernelRows kernelColumns strideRows strideColumns) ('D2 inputRows inputColumns) ('D2 outputRows outputColumns) where
+         ) => Layer (Pooling kernelRows kernelColumns strideRows strideColumns) ('D2 inputRows inputColumns) ('D2 outputRows outputColumns) where
  runForwards Pooling (S2D' input) =
    let kx = fromIntegral $ natVal (Proxy :: Proxy kernelRows)
        ky = fromIntegral $ natVal (Proxy :: Proxy kernelColumns)
@ -74,8 +77,8 @@ instance ( Monad m
        ex = extract input
        r  = poolForward kx ky sx sy ox oy $ ex
        rs = fromJust . create $ r
-    in  return . S2D' $ rs
-  runBackards _ Pooling (S2D' input) (S2D' dEdy) =
+    in  S2D' $ rs
+  runBackards Pooling (S2D' input) (S2D' dEdy) =
    let kx = fromIntegral $ natVal (Proxy :: Proxy kernelRows)
        ky = fromIntegral $ natVal (Proxy :: Proxy kernelColumns)
        sx = fromIntegral $ natVal (Proxy :: Proxy strideRows)
@ -83,12 +86,11 @@ instance ( Monad m
        ex = extract input
        eo = extract dEdy
        vs = poolBackward kx ky sx sy ex eo
-    in  return (Pooling, S2D' . fromJust . create $ vs)
+    in  ((), S2D' . fromJust . create $ vs)


 -- | A three dimensional image can be pooled on each layer.
-instance ( Monad m
-         , KnownNat kernelRows
+instance ( KnownNat kernelRows
         , KnownNat kernelColumns
         , KnownNat strideRows
         , KnownNat strideColumns
@ -98,7 +100,7 @@ instance ( Monad m
         , KnownNat outputColumns
         , ((outputRows - 1) * strideRows) ~ (inputRows - kernelRows)
         , ((outputColumns - 1) * strideColumns) ~ (inputColumns - kernelColumns)
-         ) => Layer m (Pooling kernelRows kernelColumns strideRows strideColumns) ('D3 inputRows inputColumns channels) ('D3 outputRows outputColumns channels) where
+         ) => Layer (Pooling kernelRows kernelColumns strideRows strideColumns) ('D3 inputRows inputColumns channels) ('D3 outputRows outputColumns channels) where
  runForwards Pooling (S3D' input) =
    let ix = fromIntegral $ natVal (Proxy :: Proxy inputRows)
        iy = fromIntegral $ natVal (Proxy :: Proxy inputColumns)
@ -111,8 +113,8 @@ instance ( Monad m
        ex = fmap extract input
        r  = poolForwardList kx ky sx sy ix iy ox oy ex
        rs = fmap (fromJust . create) r
-    in  return . S3D' $ rs
-  runBackards _ Pooling (S3D' input) (S3D' dEdy) =
+    in  S3D' rs
+  runBackards Pooling (S3D' input) (S3D' dEdy) =
    let ix = fromIntegral $ natVal (Proxy :: Proxy inputRows)
        iy = fromIntegral $ natVal (Proxy :: Proxy inputColumns)
        kx = fromIntegral $ natVal (Proxy :: Proxy kernelRows)
@ -123,7 +125,7 @@ instance ( Monad m
        eo = fmap extract dEdy
        ez = vectorZip (,) ex eo
        vs = poolBackwardList kx ky sx sy ix iy ez
-    in  return (Pooling, S3D' . fmap (fromJust . create) $ vs)
+    in  ((), S3D' . fmap (fromJust . create) $ vs)

 poolForward :: Int -> Int -> Int -> Int -> Int -> Int -> Matrix Double -> Matrix Double
 poolForward nrows ncols srows scols outputRows outputCols m =
--- a/src/Grenade/Layers/Relu.hs
+++ b/src/Grenade/Layers/Relu.hs
@ -1,7 +1,5 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@ -24,26 +22,30 @@ import qualified Numeric.LinearAlgebra.Static as LAS
 data Relu = Relu
  deriving Show

-instance (Monad m, KnownNat i) => Layer m Relu ('D1 i) ('D1 i) where
-  runForwards _ (S1D' y) = return $ S1D' (relu y)
+instance UpdateLayer Relu where
+  type Gradient Relu = ()
+  runUpdate _ _ _ = Relu
+
+instance ( KnownNat i) => Layer Relu ('D1 i) ('D1 i) where
+  runForwards _ (S1D' y) = S1D' (relu y)
    where
      relu = LAS.dvmap (\a -> if a <= 0 then 0 else a)
-  runBackards _ _ (S1D' y) (S1D' dEdy) = return (Relu, S1D' (relu' y * dEdy))
+  runBackards _ (S1D' y) (S1D' dEdy) = ((), S1D' (relu' y * dEdy))
    where
      relu' = LAS.dvmap (\a -> if a <= 0 then 0 else 1)

-instance (Monad m, KnownNat i, KnownNat j) => Layer m Relu ('D2 i j) ('D2 i j) where
-  runForwards _ (S2D' y) = return $ S2D' (relu y)
+instance (KnownNat i, KnownNat j) => Layer Relu ('D2 i j) ('D2 i j) where
+  runForwards _ (S2D' y) = S2D' (relu y)
    where
      relu = LAS.dmmap (\a -> if a <= 0 then 0 else a)
-  runBackards _ _ (S2D' y) (S2D' dEdy) = return (Relu, S2D' (relu' y * dEdy))
+  runBackards _ (S2D' y) (S2D' dEdy) = ((), S2D' (relu' y * dEdy))
    where
      relu' = LAS.dmmap (\a -> if a <= 0 then 0 else 1)

-instance (Monad m, KnownNat i, KnownNat j, KnownNat k) => Layer m Relu ('D3 i j k) ('D3 i j k) where
-  runForwards _ (S3D' y) = return $ S3D' (fmap relu y)
+instance (KnownNat i, KnownNat j, KnownNat k) => Layer Relu ('D3 i j k) ('D3 i j k) where
+  runForwards _ (S3D' y) = S3D' (fmap relu y)
    where
      relu = LAS.dmmap (\a -> if a <= 0 then 0 else a)
-  runBackards _ _ (S3D' y) (S3D' dEdy) = return (Relu, S3D' (vectorZip (\y' dEdy' -> relu' y' * dEdy') y dEdy))
+  runBackards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> relu' y' * dEdy') y dEdy))
    where
      relu' = LAS.dmmap (\a -> if a <= 0 then 0 else 1)
--- a/src/Grenade/Layers/Tanh.hs
+++ b/src/Grenade/Layers/Tanh.hs
@ -1,7 +1,5 @@
-{-# LANGUAGE BangPatterns          #-}
 {-# LANGUAGE DataKinds             #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
-{-# LANGUAGE StandaloneDeriving    #-}
 {-# LANGUAGE TypeOperators         #-}
 {-# LANGUAGE TypeFamilies          #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
@ -21,17 +19,21 @@ import           Grenade.Core.Shape
 data Tanh = Tanh
  deriving Show

-instance (Monad m, KnownNat i) => Layer m Tanh ('D1 i) ('D1 i) where
-  runForwards _ (S1D' y) = return $ S1D' (tanh y)
-  runBackards _ _ (S1D' y) (S1D' dEdy) = return (Tanh, S1D' (tanh' y * dEdy))
+instance UpdateLayer Tanh where
+  type Gradient Tanh = ()
+  runUpdate _ _ _ = Tanh

-instance (Monad m, KnownNat i, KnownNat j) => Layer m Tanh ('D2 i j) ('D2 i j) where
-  runForwards _ (S2D' y) = return $ S2D' (tanh y)
-  runBackards _ _ (S2D' y) (S2D' dEdy) = return (Tanh, S2D' (tanh' y * dEdy))
+instance KnownNat i => Layer Tanh ('D1 i) ('D1 i) where
+  runForwards _ (S1D' y) = S1D' (tanh y)
+  runBackards _ (S1D' y) (S1D' dEdy) = ((), S1D' (tanh' y * dEdy))

-instance (Monad m, KnownNat i, KnownNat j, KnownNat k) => Layer m Tanh ('D3 i j k) ('D3 i j k) where
-  runForwards _ (S3D' y) = return $ S3D' (fmap tanh y)
-  runBackards _ _ (S3D' y) (S3D' dEdy) = return (Tanh, S3D' (vectorZip (\y' dEdy' -> tanh' y' * dEdy') y dEdy))
+instance (KnownNat i, KnownNat j) => Layer Tanh ('D2 i j) ('D2 i j) where
+  runForwards _ (S2D' y) =  S2D' (tanh y)
+  runBackards _ (S2D' y) (S2D' dEdy) = ((), S2D' (tanh' y * dEdy))
+
+instance (KnownNat i, KnownNat j, KnownNat k) => Layer Tanh ('D3 i j k) ('D3 i j k) where
+  runForwards _ (S3D' y) = S3D' (fmap tanh y)
+  runBackards _ (S3D' y) (S3D' dEdy) = ((), S3D' (vectorZip (\y' dEdy' -> tanh' y' * dEdy') y dEdy))

 tanh' :: (Floating a) => a -> a
 tanh' t = 1 - s ^ (2 :: Int)  where s = tanh t
--- a/test/Test/Grenade/Layers/Convolution.hs
+++ b/test/Test/Grenade/Layers/Convolution.hs
@ -4,8 +4,6 @@
 {-# OPTIONS_GHC -fno-warn-missing-signatures #-}
 module Test.Grenade.Layers.Convolution where

-import           Control.Monad.Identity
-
 import           Grenade.Core.Shape
 import           Grenade.Core.Vector as Grenade
 import           Grenade.Core.Network
@ -93,11 +91,12 @@ prop_simple_conv_forwards = once $
                 , 0.0,  0.0,  0.0,  0.0
                 , 0.0,  0.0,  0.0,  0.0
                 , 0.0,  0.0,  0.0,  0.0 ] :: HStatic.L 4 4)
-      --expectedKernel = (HStatic.matrix
-      --           [ 0.0,  0.0,  0.0, -2.0
-      --           ,-2.0,  1.0,  1.0, -5.0
-      --           ,-3.0, -1.0,  1.0, -5.0
-      --           ,-5.0,  0.0,  0.0, -7.0 ] :: HStatic.L 4 4)
+
+      expectedGradient = (HStatic.matrix
+                 [ 1.0, 0.0, 0.0, 2.0
+                 , 2.0, 0.0, 0.0, 5.0
+                 , 3.0, 0.0, 0.0, 4.0
+                 , 4.0, 0.0, 0.0, 6.0 ] :: HStatic.L 4 4)

      convLayer = Convolution myKernel zeroKernel :: Convolution 1 4 2 2 1 1

@ -113,7 +112,7 @@ prop_simple_conv_forwards = once $
                 [  5.0 ,  9.0  ] :: HStatic.L 1 2)
               ,(HStatic.matrix
                 [ -7.0 , -10.0 ] :: HStatic.L 1 2)]) :: [HStatic.L 1 2]
-      out  = runIdentity $ runForwards convLayer input :: S' ('D3 1 2 4)
+      out  = runForwards convLayer input :: S' ('D3 1 2 4)

      grad =  S3D' ( mkVector
               [(HStatic.matrix
@ -128,12 +127,13 @@ prop_simple_conv_forwards = once $
      expectBack = (HStatic.matrix
                   [  1.0,  0.0, 0.0
                   ,  0.0, -2.0,-1.0] :: HStatic.L 2 3)
-      (nc, inX)  = runIdentity $ runBackards 1 convLayer input grad :: ( Convolution 1 4 2 2 1 1 ,  S' ('D2 2 3))
+      (nc, inX)  =  runBackards convLayer input grad

  in case (out, inX, nc) of
-    (S3D' out' , S2D' inX', Convolution _ _)
+    (S3D' out' , S2D' inX', Convolution' backGrad)
      -> ((HStatic.extract <$> expect) === (HStatic.extract <$> vecToList out'))
      .&&. ((HStatic.extract expectBack) === (HStatic.extract inX'))
+      .&&. ((HStatic.extract expectedGradient) === (HStatic.extract backGrad))
      -- Temporarily disabled, as l2 adjustment puts in off 5%
      -- .&&. HStatic.extract expectedKernel === HStatic.extract kernel'

@ -203,11 +203,12 @@ prop_single_conv_forwards = once $
                 , 0.0,  0.0,  0.0,  0.0
                 , 0.0,  0.0,  0.0,  0.0
                 , 0.0,  0.0,  0.0,  0.0 ] :: HStatic.L 4 4)
-      --expectedKernel = (HStatic.matrix
-      --           [ 0.0,  0.0,  0.0, -2.0
-      --           ,-2.0,  1.0,  1.0, -5.0
-      --           ,-3.0, -1.0,  1.0, -5.0
-      --           ,-5.0,  0.0,  0.0, -7.0 ] :: HStatic.L 4 4)
+
+      expectedGradient = (HStatic.matrix
+                 [ 1.0, 0.0, 0.0, 2.0
+                 , 2.0, 0.0, 0.0, 5.0
+                 , 3.0, 0.0, 0.0, 4.0
+                 , 4.0, 0.0, 0.0, 6.0 ] :: HStatic.L 4 4)

      convLayer = Convolution myKernel zeroKernel :: Convolution 1 4 2 2 1 1

@ -223,7 +224,7 @@ prop_single_conv_forwards = once $
                 [  5.0 ,  9.0  ] :: HStatic.L 1 2)
               ,(HStatic.matrix
                 [ -7.0 , -10.0 ] :: HStatic.L 1 2)]) :: [HStatic.L 1 2]
-      out  = runIdentity $ runForwards convLayer input :: S' ('D3 1 2 4)
+      out  = runForwards convLayer input :: S' ('D3 1 2 4)

      grad =  S3D' ( mkVector
               [(HStatic.matrix
@ -238,13 +239,13 @@ prop_single_conv_forwards = once $
      expectBack = (HStatic.matrix
                   [  1.0,  0.0, 0.0
                   ,  0.0, -2.0,-1.0] :: HStatic.L 2 3)
-      (nc, inX)  = runIdentity $ runBackards 1 convLayer input grad :: ( Convolution 1 4 2 2 1 1 ,  S' ('D3 2 3 1))
+      (nc, inX)  = runBackards convLayer input grad

  in case (out, inX, nc) of
-    (S3D' out' , S3D' inX', Convolution _ _)
+    (S3D' out' , S3D' inX', Convolution' backGrad)
      ->   ((HStatic.extract <$> expect)  === (HStatic.extract <$> vecToList out'))
      .&&. ([HStatic.extract expectBack]  === (HStatic.extract <$> vecToList inX'))
-      -- .&&. HStatic.extract expectedKernel === HStatic.extract kernel'
+      .&&. ((HStatic.extract expectedGradient) === (HStatic.extract backGrad))

 return []
 tests :: IO Bool
				`@ -0,0 +1 @@`
				`Subproject commit 3ea83a82b058ba2e1dd216d9e7832fd49cf33dbd`