diff --git a/README.md b/README.md
index 5f6b56d..cde6227 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,11 @@
 # NNExperiments
+AI stuff.
+
+## TODOs
+
+- Regularization (L1/L2)
+- Implement momentum
+- Optimize matrix multiplication
+- ???
+- Profit
 
diff --git a/src/main.nim b/src/main.nim
index 01a5d39..195d535 100644
--- a/src/main.nim
+++ b/src/main.nim
@@ -2,9 +2,11 @@
 import nn/network
 import nn/util/matrix
 
-var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid),
-                             newDenseLayer(10, 16, Sigmoid),
-                             newDenseLayer(16, 10, Softmax)],
-                           lossFunc=MSE, learnRate=0.05, momentum=0.55,
+var mlp = newNeuralNetwork(@[newDenseLayer(784, 10),
+                             newDenseLayer(10, 16),
+                             newDenseLayer(16, 10)],
+                           lossFunc=MSE, activationFunc=Softmax,
+                           learnRate=0.05, momentum=0.55,
                            weightRange=(start: -1.0, stop: 1.0),
                            biasRange=(start: -1.0, stop: 1.0))
+
diff --git a/src/nn/network.nim b/src/nn/network.nim
index 1d38655..423db38 100644
--- a/src/nn/network.nim
+++ b/src/nn/network.nim
@@ -29,7 +29,8 @@ type
         ## A generic feed-forward
         ## neural network
         layers*: seq[Layer]
-        loss: Loss  # The cost function along with its derivative
+        loss: Loss  # The network's cost function
+        activation: Activation  # The network's activation function
         # The network's learn rate determines
         # the amount of progress that is made
         # at each step when performing gradient
@@ -41,23 +42,25 @@ type
         # we nudge our inputs for our next epoch
         momentum*: float
     Loss* = ref object
-        ## A loss function and its derivative
+        ## A vectorized loss function and its derivative
         function: proc (a, b: Matrix[float]): float
         derivative: proc (x, y: Matrix[float]): Matrix[float] {.noSideEffect.}
     Activation* = ref object
-        ## An activation function
+        ## A vectorized activation function and its
+        ## derivative
         function: proc (input: Matrix[float]): Matrix[float] {.noSideEffect.}
         derivative: proc (x: Matrix[float]): Matrix[float] {.noSideEffect.}
+    LayerKind* = enum
+        ## A layer enumeration
+        Dense, Dropout, Sparse
     Layer* = ref object
         ## A generic neural network
         ## layer
+        kind*: LayerKind  # TODO (add dropout and sparse layer!)
         inputSize*: int  # The number of inputs we process
         outputSize*: int  # The number of outputs we produce
         weights*: Matrix[float]  # The weights for each connection (2D)
         biases*: Matrix[float]  # The biases for each neuron (1D)
-        gradients: tuple[weights, biases: Matrix[float]]  # Gradient coefficients for weights and biases
-        activation: Activation  # The layer's activation function
-
 
 
 proc `$`*(self: Layer): string =
@@ -86,82 +89,109 @@ proc newActivation*(function: proc (input: Matrix[float]): Matrix[float] {.noSid
     result.derivative = derivative
 
 
-proc newDenseLayer*(inputSize: int, outputSize: int, activationFunc: Activation): Layer =
+proc newDenseLayer*(inputSize: int, outputSize: int): Layer =
     ## Creates a new dense layer with inputSize input
-    ## parameters and outputSize outgoing outputs and
-    ## using the chosen activation function.
+    ## parameters and outputSize outgoing outputs.
     new(result)
     result.inputSize = inputSize
     result.outputSize = outputSize
-    result.activation = activationFunc
+    result.kind = Dense
 
 
-proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, momentum: float,
-                       weightRange, biasRange: tuple[start, stop: float]): NeuralNetwork =
+proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation,
+                       learnRate: float, momentum: float, weightRange,
+                       biasRange: tuple[start, stop: float]): NeuralNetwork =
     ## Initializes a new neural network with
     ## the given topology and hyperparameters.
     ## Weights and biases are initialized with
-    ## random values in the chosen range
+    ## random values in the chosen range using
+    ## nim's default PRNG
     new(result)
     result.layers = topology
     for layer in result.layers:
         var biases = newSeqOfCap[float](layer.outputSize)
-        var biasGradients = newSeqOfCap[float](layer.outputSize)
         for _ in 0.. 0.0, 0.0)
 # TODO: Add derivatives for this stuff
 func step(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = (if x < 0.0: 0.0 else: x), axis = -1)
 func silu(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = 1 / (1 + exp(-x)), axis= -1)
-func relu(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = max(0.0, x), axis = -1)
+
 
 func htan(input: Matrix[float]): Matrix[float] {.used.} =
     let f = proc (x: float): float =
@@ -260,6 +299,7 @@ func htan(input: Matrix[float]): Matrix[float] {.used.} =
 {.hints: off.}  # So nim doesn't complain about the naming
 var Sigmoid* = newActivation(sigmoid, sigmoidDerivative)
 var Softmax* = newActivation(softmax, softmaxDerivative)
+var ReLU* = newActivation(relu, dxRelu)
 var MSE* = newLoss(mse, dxMSE)
 {.pop.}
 
diff --git a/src/nn/util/matrix.nim b/src/nn/util/matrix.nim
index 708895c..4e58d7f 100644
--- a/src/nn/util/matrix.nim
+++ b/src/nn/util/matrix.nim
@@ -1079,7 +1079,7 @@ proc count*[T](self: Matrix[T], e: T): int =
 proc replace*[T](self: Matrix[T], other: Matrix[T], copy: bool = false) =
     ## Replaces the data in self with the data from
     ## other (a copy is not performed unless copy equals
-    ## true). A reference to the object is returned
+    ## true)
     if copy:
         self.data[] = other.data[]
     else:
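
Note on the new ReLU export: the patch wires up var ReLU* = newActivation(relu, dxRelu), but the bodies of relu and dxRelu do not appear in full in the hunks above. Purely as an illustrative sketch (not the repository's actual implementation), an element-wise pair with the shape newActivation expects could be built on the same Matrix apply helper used by the removed relu line:

    import nn/util/matrix

    # Sketch: element-wise ReLU, max(0, x) on every entry of the matrix
    func relu(input: Matrix[float]): Matrix[float] =
        input.apply(proc (x: float): float = max(0.0, x), axis = -1)

    # Sketch: assumed derivative, 1.0 where x > 0 and 0.0 elsewhere (not taken from the patch)
    func dxRelu(input: Matrix[float]): Matrix[float] =
        input.apply(proc (x: float): float = (if x > 0.0: 1.0 else: 0.0), axis = -1)

Both procs map a Matrix[float] to a Matrix[float] and are side-effect free, which is what the function and derivative fields of the updated Activation object require.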