Initial ground work for neural network complete

2023-03-21 16:44:08 +01:00 · 2023-03-21 16:44:08 +01:00 · a97cec41a6
parent 9506c554ec
commit a97cec41a6
3 changed files with 134 additions and 19 deletions
--- a/src/main.nim
+++ b/src/main.nim
@ -2,9 +2,9 @@ import nn/network
 import nn/util/matrix
# Builds the network from three dense layers. The layer arguments appear to
# be (inputSize, outputSize, activation) -- TODO confirm against newDenseLayer.
# The updated topology chains 784 -> 10 -> 16 -> 10 and ends in a Softmax layer.
-var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, Sigmoid), 
+var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid), 
-                             newDenseLayer(3, 2, Sigmoid),
+                             newDenseLayer(10, 16, Sigmoid),
-                             newDenseLayer(2, 3, Softmax)],
+                             newDenseLayer(16, 10, Softmax)],
                           lossFunc=MSE, learnRate=0.05, momentum=0.55,
-                           weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -1.0, stop: 1.0))
+                           weightRange=(start: -1.0, stop: 1.0),
-echo mlp.feedforward(newMatrix[float](@[1.0, 2.0]))
+                           biasRange=(start: -1.0, stop: 1.0))
--- a/src/nn/network.nim
+++ b/src/nn/network.nim
@ -18,6 +18,7 @@ import util/matrix
 import std/strformat
 import std/random
 import std/math
 import std/sequtils
 randomize()
@ -128,22 +129,94 @@ proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, m
    result.momentum = momentum
-proc feedforward*(self: NeuralNetwork, data: Matrix[float]): Matrix[float] =
+proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
-    ## Feeds the given input through the network and returns
+    ## Performs a single backpropagation step and returns the
-    ## a 1D array with the output
+    ## gradient of the cost function for the weights and biases 
-    when not defined(release):
+    ## of the network according to the given training sample
    ##
    ## `x` is the sample's input and `y` its expected output. The
    ## result holds one matrix per layer, in layer order: `weights[i]`
    ## and `biases[i]` are the gradients for `self.layers[i]`
-        if data.shape.rows > 1:
+    var
-            raise newException(ValueError, "input data must be one-dimensional")
+        # The deltas for the weights and biases of
-        if data.shape.cols != self.layers[0].inputSize:
+        # each layer in the network
-            raise newException(ValueError, &"input is of the wrong shape (expecting (1, {self.layers[0].inputSize}), got ({data.shape.rows}, {data.shape.cols}) instead)")
+        deltaW: seq[Matrix[float]] = @[]
-    result = data
+        deltaB: seq[Matrix[float]] = @[]
        # Output of the most recently evaluated layer (starts as the input)
        activation = x
        # Activations of each layer, preceded by the input itself
        activations: seq[Matrix[float]] = @[x]
        # Unactivated outputs of each layer
        unactivated: seq[Matrix[float]] = @[]
    # Forward pass through the network
    for layer in self.layers:
-        result = layer.activation.function(layer.weights.dot(result) + layer.biases)
+        deltaW.add(zeros[float](layer.weights.shape))
        deltaB.add(zeros[float](layer.biases.shape))
        unactivated.add(layer.weights.dot(activation) + layer.biases)
        # Each layer must be fed the output of the previous one, so the
        # running activation is advanced before moving to the next layer
        activation = layer.activation.function(unactivated[^1])
        activations.add(activation)
    # Backwards pass
    # Error of the output layer: the derivative of the loss times the
    # derivative of the activation, which is just an application of
    # the chain rule
    var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.layers[^1].activation.derivative(unactivated[^1])
    deltaB[^1].replace(diff)
    # The weight gradient combines the layer's error with the activations
    # that were fed into it (activations[^2] is the last layer's input)
    deltaW[^1].replace(diff.dot(activations[^2].transpose()))
    for l in 2..self.layers.len():
        # The ^ makes our indices start from the back instead of
        # from the front, so we're really iterating over our layers
        # backwards, all the way down to the first one. The error is
        # pulled back through the weights of the layer that comes
        # *after* layer ^l, which is layer ^(l - 1)
        diff = self.layers[^(l - 1)].weights.transpose.dot(diff) * self.layers[^l].activation.derivative(unactivated[^l])
        deltaB[^l].replace(diff)
        # activations has one more entry than layers (it starts with
        # x), so the input of layer ^l is activations[^(l + 1)]
        deltaW[^l].replace(diff.dot(activations[^(l + 1)].transpose()))
    return (deltaW, deltaB)
-proc backprop(self: NeuralNetwork, x, y: Matrix[float]) {.used.} =
+proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
-    ## Performs a single backpropagation step and updates the
+    ## Performs a single mini-batch step in stochastic gradient
-    ## gradients for our weights and biases, layer by layer
+    ## descent and updates the network's weights and biases
    ## accordingly
    ##
    ## The gradients returned by `backprop` are summed over every
    ## sample in `data`; each layer is then moved against the averaged
    ## gradient, scaled by the learning rate. An empty batch is a no-op
    if data.len() == 0:
        # Nothing to average: dividing the learning rate by a batch size
        # of zero would turn every weight and bias into inf/NaN
        return
    # Gradients accumulated over the whole batch, one matrix per layer
    var
        weights: seq[Matrix[float]] = @[]
        biases: seq[Matrix[float]] = @[]
    for layer in self.layers:
        weights.add(zeros[float](layer.weights.shape))
        biases.add(zeros[float](layer.biases.shape))
    for dataPoint in data:
        let gradient = self.backprop(dataPoint.x, dataPoint.y)
        for i, layerBiases in gradient.biases:
            biases[i] = biases[i] + layerBiases
        for i, layerWeights in gradient.weights:
            weights[i] = weights[i] + layerWeights
    # The backpropagation algorithm lets us find the gradient of steepest ascent
    # in our cost function, so we subtract it from the current weights and biases
    # to descend it the fastest (it's not actually *the* fastest because true gradient
    # descent would perform this over all training samples, but it's a pretty good
    # approximation nonetheless, it converges quickly and it actually helps prevent
    # overfitting by not letting the network train over the same data over and over
    # again)
    # The step size is the same for every layer, so it is computed only once
    let step = self.learnRate / data.len().float
    for (layer, newBiases) in zip(self.layers, biases):
        layer.biases = layer.biases - step * newBiases
    for (layer, newWeights) in zip(self.layers, weights):
        layer.weights = layer.weights - step * newWeights
 proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =
    ## Train the network on the given data for the specified
    ## number of epochs using the given batch size by applying
    ## stochastic gradient descent
    ##
    ## `data` is shuffled in place at the start of every epoch and
    ## then cut into consecutive batches of `batchSize` samples (the
    ## last batch is shorter when the data does not divide evenly).
    ## Raises ValueError if `batchSize` is not positive
    if batchSize <= 0:
        # A non-positive batch size would never advance the batching
        # loop below, so it is rejected up front
        raise newException(ValueError, "batch size must be a positive integer")
    for epoch in 0..<epochs:
        # We shuffle the data so that different epochs work
        # on different data points. This will hopefully help
        # the network generalize its training onto unseen data
        shuffle(data)
        var start = 0
        while start < data.len():
            # Clamp the end of the slice so the final, possibly
            # partial, batch never reads past the end of the data
            let stop = min(start + batchSize, data.len())
            self.miniBatch(data[start..<stop])
            start = stop
 ## Utility functions
--- a/src/nn/util/matrix.nim
+++ b/src/nn/util/matrix.nim
@ -425,7 +425,6 @@ proc copy*[T](self: MatrixView[T]): Matrix[T] =
    for e in self:
        result.data[].add(e)
    result.shape = self.shape
    result.m = self.m
 proc dup*[T](self: MatrixView[T]): MatrixView[T] =
@ -462,6 +461,20 @@ proc `/`*[T](a: Matrix[T], b: T): Matrix[T] = a.copy().apply(divide, b, axis= -1
 proc `/`*[T](a: T, b: Matrix[T]): Matrix[T] = b.copy().apply(divide, a, axis= -1)
 # matrix view/scalar operations. Each one copies the view into a new
 # matrix and calls apply on that copy with the scalar as the operand
 # (axis = -1 presumably selects every element -- TODO confirm)
 # NOTE(review): both operand orders forward to the same apply call, so
 # `scalar - view` and `view - scalar` (likewise for `/`) produce the
 # same result unless apply tells them apart some other way -- verify
 proc `+`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(add, b, axis= -1)
 proc `+`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(add, a, axis= -1)
 proc `-`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(sub, b, axis= -1)
 proc `-`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(sub, a, axis= -1)
 # NOTE(review): unlike its siblings, unary minus hands the view `a`
 # itself to apply as the operand -- confirm this is what neg expects
 proc `-`*[T](a: MatrixView[T]): Matrix[T] = a.copy().apply(neg, a, axis= -1)
 proc `*`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(mul, b, axis = -1)
 proc `*`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(mul, a, axis= -1)
 proc `/`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(divide, b, axis= -1)
 proc `/`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(divide, a, axis= -1)
 # matrix/matrix operations. They produce a new matrix with the
 # result of the operation
@ -942,6 +955,23 @@ proc dot*[T](self, other: Matrix[T]): Matrix[T] =
        return self * other
 proc dot*[T](self: MatrixView[T], other: Matrix[T]): Matrix[T] =
    ## Computes the dot product of the two
    ## input matrices
    ##
    ## Entry i of the result is the sum of the product
    ## `other[0] * self[i]`. Outside of release builds a
    ## ValueError is raised when the column counts of the
    ## two operands differ
    when not defined(release):
        if self.shape.cols != other.shape.cols:
            raise newException(ValueError, &"incompatible argument shapes for dot product")
    # NOTE(review): the result is requested with shape (0, self.shape.rows)
    # while the loop below runs over result.shape.cols -- presumably a row
    # count of 0 marks a 1D matrix with that many columns; confirm
    result = zeros[T]((0, self.shape.rows))
    for i in 0..<result.shape.cols:
        # Only other[0] is ever read here -- presumably `other` is
        # expected to hold a single row/vector; TODO confirm
        result[0, i] = (other[0] * self[i]).sum()
 # Matrix/view overload: forwards to the view/matrix overload with the
 # two operands swapped
 proc dot*[T](self: Matrix[T], other: MatrixView[T]): Matrix[T] {.inline.} = result = other.dot(self)
 # View/view overload: multiplies the two views and sums the product
 # into a single value
 proc dot*[T](self, other: MatrixView[T]): T = (self * other).sum()
 proc where*[T](cond: Matrix[bool], x, y: Matrix[T]): Matrix[T] =
    ## Return elements chosen from x or y depending on cond
    ## Where cond is true, take elements from x, otherwise
@ -1046,6 +1076,18 @@ proc count*[T](self: Matrix[T], e: T): int =
                inc(result) 
 proc replace*[T](self: Matrix[T], other: Matrix[T], copy: bool = false) =
    ## Makes self take over the contents of other: its
    ## data, order and shape. By default self ends up
    ## pointing at other's data storage; pass copy = true
    ## to have the elements copied over instead. Nothing
    ## is returned: self is updated in place
    if not copy:
        # Share the underlying storage with other
        self.data = other.data
    else:
        # Overwrite self's own storage with other's elements
        self.data[] = other.data[]
    self.order = other.order
    self.shape = other.shape
 when isMainModule:
    import math