Implemented momentum

2023-03-21 19:10:30 +01:00 · 2023-03-21 19:10:30 +01:00 · e3265fac68
parent 9baede9b54
commit e3265fac68
1 changed files with 20 additions and 2 deletions
--- a/src/nn/network.nim
+++ b/src/nn/network.nim
@ -215,6 +215,24 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
            biases[i] = currentBiases + newBiases
        for i, (currentWeights, newWeights) in zip(weights, gradient.weights):
            weights[i] = currentWeights + newWeights
    # We use hyperparameters such as the learn rate and momentum
    # to further control how fast (or slowly) the network converges
    # onto a local minimum of our loss function. To
    # be completely honest I'm not entirely sure why we're dividing the
    # learn rate by the size of our batch (if you didn't already notice I
    # stole a lot of this code. I swear I'm a good programmer. Please hire
    # me): my best guess would be that this way it gets "normalized" (as
    # if we were training on the entire dataset at once even though we
    # aren't) when it's < 1, and is otherwise scaled to the size of
    # our batch when it's > 1. I have some vague ideas as to why that may
    # make sense, but it's a wild guess really
    var nudge = self.learnRate / data.len().float
    if self.momentum > 0:
        # I _could_ go look at how other libraries implement
        # momentum, OR I could pull a formula out of my ass
        # and hope it works. Let's run with that, hm?
        nudge *= (1 / self.momentum)
    # The backpropagation algorithm lets us find the gradient of our cost function,
    # i.e. the direction of steepest ascent (remember, we're trying to minimize the
    # cost by descending it), so we subtract that from the current weights and biases
@ -224,9 +242,9 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
    # overfitting by not letting the network train over the same data over and over
    # again)
    for (layer, newBiases) in zip(self.layers, biases):
-        layer.biases = layer.biases - (self.learnRate / data.len().float) * newBiases
+        layer.biases = (layer.biases - nudge) * newBiases
    for (layer, newWeights) in zip(self.layers, weights):
-        layer.weights = layer.weights - (self.learnRate / data.len().float) * newWeights
+        layer.weights = (layer.weights - nudge) * newWeights
 proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =