diff --git a/src/nn/network.nim b/src/nn/network.nim index 423db38..e88da14 100644 --- a/src/nn/network.nim +++ b/src/nn/network.nim @@ -215,6 +215,24 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) = biases[i] = currentBiases + newBiases for i, (currentWeights, newWeights) in zip(weights, gradient.weights): weights[i] = currentWeights + newWeights + + # We use hyperparameters such as the learn rate and momentum + # to further control how fast (or slowly) the network converges + # onto a local minimum of the gradient of our loss function. To + # be completely honest I'm not entirely sure why we're dividing the + # learn rate by the size of our batch (if you didn't already notice I + # stole a lot of this code. I swear I'm a good programmer. Please hire + # me): my best guess would be that this way it gets "normalized" (as + # if we were training on the entire dataset at once even though we + # aren't) when it's < 1 and are otherwise scaling it to the size of + # our batch when it's > 1. I have some vague ideas as to why that may + # make sense, but it's a wild guess really + var nudge = self.learnRate / data.len().float + if self.momentum > 0: + # I _could_ go look at how other libraries implement + # momentum, OR I could pull a formula out of my ass + # and hope it works. Let's run with that, hm? 
+ nudge *= (1 / self.momentum) # The backpropagation algorithm lets us find the direction of steepest ascent # in the gradient of our cost function (which, remember, we're trying to minimize # by climbing it down), so we subtract that from the current weights and biases @@ -224,9 +242,9 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) = # overfitting by not letting the network train over the same data over and over # again) for (layer, newBiases) in zip(self.layers, biases): - layer.biases = layer.biases - (self.learnRate / data.len().float) * newBiases + layer.biases = layer.biases - nudge * newBiases for (layer, newWeights) in zip(self.layers, weights): - layer.weights = layer.weights - (self.learnRate / data.len().float) * newWeights + layer.weights = layer.weights - nudge * newWeights proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =