diff --git a/src/main.nim b/src/main.nim index e4cd191..01a5d39 100644 --- a/src/main.nim +++ b/src/main.nim @@ -2,9 +2,9 @@ import nn/network import nn/util/matrix -var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, Sigmoid), - newDenseLayer(3, 2, Sigmoid), - newDenseLayer(2, 3, Softmax)], +var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid), + newDenseLayer(10, 16, Sigmoid), + newDenseLayer(16, 10, Softmax)], lossFunc=MSE, learnRate=0.05, momentum=0.55, - weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -1.0, stop: 1.0)) -echo mlp.feedforward(newMatrix[float](@[1.0, 2.0])) + weightRange=(start: -1.0, stop: 1.0), + biasRange=(start: -1.0, stop: 1.0)) diff --git a/src/nn/network.nim b/src/nn/network.nim index 336e87a..1d38655 100644 --- a/src/nn/network.nim +++ b/src/nn/network.nim @@ -18,6 +18,7 @@ import util/matrix import std/strformat import std/random import std/math +import std/sequtils randomize() @@ -128,22 +129,94 @@ proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, m result.momentum = momentum -proc feedforward*(self: NeuralNetwork, data: Matrix[float]): Matrix[float] = - ## Feeds the given input through the network and returns - ## a 1D array with the output - when not defined(release): - if data.shape.rows > 1: - raise newException(ValueError, "input data must be one-dimensional") - if data.shape.cols != self.layers[0].inputSize: - raise newException(ValueError, &"input is of the wrong shape (expecting (1, {self.layers[0].inputSize}), got ({data.shape.rows}, {data.shape.cols}) instead)") - result = data +proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] = + ## Performs a single backpropagation step and returns the + ## gradient of the cost function for the weights and biases + ## of the network according to the given training sample + var + # The deltas for the weights and biases of + # each layer in the network + deltaW: seq[Matrix[float]] = @[] + deltaB: 
seq[Matrix[float]] = @[] + # Activations of each layer + activation = x + activations: seq[Matrix[float]] = @[x] + # Unactivated outputs of each layer + unactivated: seq[Matrix[float]] = @[] + # Forward pass through the network for layer in self.layers: - result = layer.activation.function(layer.weights.dot(result) + layer.biases) + deltaW.add(zeros[float](layer.weights.shape)) + deltaB.add(zeros[float](layer.biases.shape)) + unactivated.add(layer.weights.dot(activation) + layer.biases) + activations.add(layer.activation.function(unactivated[^1])) + # Backwards pass + + # The negative gradient of each layer for this sample: this is a + # partial derivative, so the multiplication here is just an + # application of the chain rule! + var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.layers[^1].activation.derivative(unactivated[^1]) + deltaB[^1].replace(diff) + deltaW[^1].replace(activations[^2].transpose()) + for l in 2..self.layers.high(): + # The ^ makes our indices start from the back instead of + # from the front, so we're really iterating over our layers + # backwards! 
+ diff = self.layers[^l].weights.transpose.dot(diff) * self.layers[^l].activation.derivative(unactivated[^l]) + deltaB[^l].replace(diff) + deltaW[^l].replace(diff.dot(activations[^(l - 1)].transpose())) + return (deltaW, deltaB) -proc backprop(self: NeuralNetwork, x, y: Matrix[float]) {.used.} = - ## Performs a single backpropagation step and updates the - ## gradients for our weights and biases, layer by layer +proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) = + ## Performs a single mini-batch step in stochastic gradient + ## descent and updates the network's weights and biases + ## accordingly + var gradient: tuple[weights, biases: seq[Matrix[float]]] + # New weights and biases + var + weights: seq[Matrix[float]] = @[] + biases: seq[Matrix[float]] = @[] + for layer in self.layers: + weights.add(zeros[float](layer.weights.shape)) + biases.add(zeros[float](layer.biases.shape)) + for dataPoint in data: + gradient = self.backprop(dataPoint.x, dataPoint.y) + for i, (currentBiases, newBiases) in zip(biases, gradient.biases): + biases[i] = currentBiases + newBiases + for i, (currentWeights, newWeights) in zip(weights, gradient.weights): + weights[i] = currentWeights + newWeights + # The backpropagation algorithm lets us find the gradient of steepest ascent + # in our cost function, so we subtract it from the current weights and biases + # to descend it the fastest (it's not actually *the* fastest because true gradient + # descent would perform this over all training samples, but it's a pretty good + # approximation nonetheless, it converges quickly and it actually helps prevent + # overfitting by not letting the network train over the same data over and over + # again) + for (layer, newBiases) in zip(self.layers, biases): + layer.biases = layer.biases - (self.learnRate / data.len().float) * newBiases + for (layer, newWeights) in zip(self.layers, weights): + layer.weights = layer.weights - (self.learnRate / data.len().float) * newWeights + + 
+proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) = + ## Train the network on the given data for the specified + ## number of epochs using the given batch size by applying + ## stochastic gradient descent + var batches: seq[seq[tuple[x, y: Matrix[float]]]] + for epoch in 0..