diff --git a/src/main.nim b/src/main.nim index e4cd191..01a5d39 100644 --- a/src/main.nim +++ b/src/main.nim @@ -2,9 +2,9 @@ import nn/network import nn/util/matrix -var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, Sigmoid), - newDenseLayer(3, 2, Sigmoid), - newDenseLayer(2, 3, Softmax)], +var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid), + newDenseLayer(10, 16, Sigmoid), + newDenseLayer(16, 10, Softmax)], lossFunc=MSE, learnRate=0.05, momentum=0.55, - weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -1.0, stop: 1.0)) -echo mlp.feedforward(newMatrix[float](@[1.0, 2.0])) + weightRange=(start: -1.0, stop: 1.0), + biasRange=(start: -1.0, stop: 1.0)) diff --git a/src/nn/network.nim b/src/nn/network.nim index 336e87a..1d38655 100644 --- a/src/nn/network.nim +++ b/src/nn/network.nim @@ -18,6 +18,7 @@ import util/matrix import std/strformat import std/random import std/math +import std/sequtils randomize() @@ -128,22 +129,94 @@ proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, m result.momentum = momentum -proc feedforward*(self: NeuralNetwork, data: Matrix[float]): Matrix[float] = - ## Feeds the given input through the network and returns - ## a 1D array with the output - when not defined(release): - if data.shape.rows > 1: - raise newException(ValueError, "input data must be one-dimensional") - if data.shape.cols != self.layers[0].inputSize: - raise newException(ValueError, &"input is of the wrong shape (expecting (1, {self.layers[0].inputSize}), got ({data.shape.rows}, {data.shape.cols}) instead)") - result = data +proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] = + ## Performs a single backpropagation step and returns the + ## gradient of the cost function for the weights and biases + ## of the network according to the given training sample + var + # The deltas for the weights and biases of + # each layer in the network + deltaW: seq[Matrix[float]] = @[] + deltaB: 
seq[Matrix[float]] = @[] + # Activations of each layer + activation = x + activations: seq[Matrix[float]] = @[x] + # Unactivated outputs of each layer + unactivated: seq[Matrix[float]] = @[] + # Forward pass through the network for layer in self.layers: - result = layer.activation.function(layer.weights.dot(result) + layer.biases) + deltaW.add(zeros[float](layer.weights.shape)) + deltaB.add(zeros[float](layer.biases.shape)) + unactivated.add(layer.weights.dot(activation) + layer.biases) + activations.add(layer.activation.function(unactivated[^1])) + # Backwards pass + + # The negative gradient of each layer for this sample: this is a + # partial derivative, so the multiplication here is just an + # application of the chain rule! + var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.layers[^1].activation.derivative(unactivated[^1]) + deltaB[^1].replace(diff) + deltaW[^1].replace(activations[^2].transpose()) + for l in 2..self.layers.high(): + # The ^ makes our indices start from the back instead of + # from the front, so we're really iterating over our layers + # backwards! 
+ diff = self.layers[^l].weights.transpose.dot(diff) * self.layers[^l].activation.derivative(unactivated[^l]) + deltaB[^l].replace(diff) + deltaW[^l].replace(diff.dot(activations[^(l - 1)].transpose())) + return (deltaW, deltaB) -proc backprop(self: NeuralNetwork, x, y: Matrix[float]) {.used.} = - ## Performs a single backpropagation step and updates the - ## gradients for our weights and biases, layer by layer +proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) = + ## Performs a single mini-batch step in stochastic gradient + ## descent and updates the network's weights and biases + ## accordingly + var gradient: tuple[weights, biases: seq[Matrix[float]]] + # New weights and biases + var + weights: seq[Matrix[float]] = @[] + biases: seq[Matrix[float]] = @[] + for layer in self.layers: + weights.add(zeros[float](layer.weights.shape)) + biases.add(zeros[float](layer.biases.shape)) + for dataPoint in data: + gradient = self.backprop(dataPoint.x, dataPoint.y) + for i, (currentBiases, newBiases) in zip(biases, gradient.biases): + biases[i] = currentBiases + newBiases + for i, (currentWeights, newWeights) in zip(weights, gradient.weights): + weights[i] = currentWeights + newWeights + # The backpropagation algorithm lets us find the gradient of steepest ascent + # in our cost function, so we subtract it from the current weights and biases + # to descend it the fastest (it's not actually *the* fastest because true gradient + # descent would perform this over all training samples, but it's a pretty good + # approximation nonetheless, it converges quickly and it actually helps prevent + # overfitting by not letting the network train over the same data over and over + # again) + for (layer, newBiases) in zip(self.layers, biases): + layer.biases = layer.biases - (self.learnRate / data.len().float) * newBiases + for (layer, newWeights) in zip(self.layers, weights): + layer.weights = layer.weights - (self.learnRate / data.len().float) * newWeights + + 
+proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) = + ## Train the network on the given data for the specified + ## number of epochs using the given batch size by applying + ## stochastic gradient descent + var batches: seq[seq[tuple[x, y: Matrix[float]]]] + for epoch in 0..