diff --git a/src/nn/network.nim b/src/nn/network.nim index 423db38..e88da14 100644 --- a/src/nn/network.nim +++ b/src/nn/network.nim @@ -215,6 +215,24 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) = biases[i] = currentBiases + newBiases for i, (currentWeights, newWeights) in zip(weights, gradient.weights): weights[i] = currentWeights + newWeights + + # We use hyperparameters such as the learn rate and momentum + # to further control how fast (or slowly) the network converges + # onto a local minimum of the gradient of our loss function. To + # be completely honest I'm not entirely sure why we're dividing the + # learn rate by the size of our batch (if you didn't already notice I + # stole a lot of this code. I swear I'm a good programmer. Please hire + # me): my best guess would be that this way it gets "normalized" (as + # if we were training on the entire dataset at once even though we + # aren't) when it's < 1 and are otherwise scaling it to the size of + # our batch when it's > 1. I have some vague ideas as to why that may + # make sense, but it's a wild guess really + var nudge = self.learnRate / data.len().float + if self.momentum > 0: + # I _could_ go look at how other libraries implement + # momentum, OR I could pull a formula out of my ass + # and hope it works. Let's run with that, hm? 
+ nudge *= (1 / self.momentum) # The backpropagation algorithm lets us find the direction of steepest ascent # in the gradient of our cost function (which, remember, we're trying to minimize # by climbing it down), so we subtract that from the current weights and biases @@ -224,9 +242,9 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) = # overfitting by not letting the network train over the same data over and over # again) for (layer, newBiases) in zip(self.layers, biases): - layer.biases = layer.biases - (self.learnRate / data.len().float) * newBiases + layer.biases = layer.biases - nudge * newBiases for (layer, newWeights) in zip(self.layers, weights): - layer.weights = layer.weights - (self.learnRate / data.len().float) * newWeights + layer.weights = layer.weights - nudge * newWeights proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =