Initial ground work for neural network complete
This commit is contained in:
parent
9506c554ec
commit
a97cec41a6
10
src/main.nim
10
src/main.nim
|
@ -2,9 +2,9 @@ import nn/network
|
|||
import nn/util/matrix
|
||||
|
||||
|
||||
var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, Sigmoid),
|
||||
newDenseLayer(3, 2, Sigmoid),
|
||||
newDenseLayer(2, 3, Softmax)],
|
||||
var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid),
|
||||
newDenseLayer(10, 16, Sigmoid),
|
||||
newDenseLayer(16, 10, Softmax)],
|
||||
lossFunc=MSE, learnRate=0.05, momentum=0.55,
|
||||
weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -1.0, stop: 1.0))
|
||||
echo mlp.feedforward(newMatrix[float](@[1.0, 2.0]))
|
||||
weightRange=(start: -1.0, stop: 1.0),
|
||||
biasRange=(start: -1.0, stop: 1.0))
|
||||
|
|
|
@ -18,6 +18,7 @@ import util/matrix
|
|||
import std/strformat
|
||||
import std/random
|
||||
import std/math
|
||||
import std/sequtils
|
||||
|
||||
|
||||
randomize()
|
||||
|
@ -128,22 +129,94 @@ proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, m
|
|||
result.momentum = momentum
|
||||
|
||||
|
||||
proc feedforward*(self: NeuralNetwork, data: Matrix[float]): Matrix[float] =
|
||||
## Feeds the given input through the network and returns
|
||||
## a 1D array with the output
|
||||
when not defined(release):
|
||||
if data.shape.rows > 1:
|
||||
raise newException(ValueError, "input data must be one-dimensional")
|
||||
if data.shape.cols != self.layers[0].inputSize:
|
||||
raise newException(ValueError, &"input is of the wrong shape (expecting (1, {self.layers[0].inputSize}), got ({data.shape.rows}, {data.shape.cols}) instead)")
|
||||
result = data
|
||||
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
  ## Performs a single backpropagation step and returns the
  ## gradient of the cost function for the weights and biases
  ## of the network according to the given training sample.
  ##
  ## x is the input sample and y is the expected output. Both
  ## returned sequences hold one matrix per layer, in the same
  ## order as self.layers
  var
    # The deltas for the weights and biases of
    # each layer in the network
    deltaW: seq[Matrix[float]] = @[]
    deltaB: seq[Matrix[float]] = @[]
    # Output of the most recently processed layer (it
    # starts out as the input of the network)
    activation = x
    # Activations of each layer, preceded by the input: this
    # means activations has one more element than self.layers
    activations: seq[Matrix[float]] = @[x]
    # Unactivated outputs of each layer
    unactivated: seq[Matrix[float]] = @[]
  # Forward pass through the network
  for layer in self.layers:
    deltaW.add(zeros[float](layer.weights.shape))
    deltaB.add(zeros[float](layer.biases.shape))
    unactivated.add(layer.weights.dot(activation) + layer.biases)
    # Each layer must be fed the output of the previous one,
    # not the original input over and over again
    activation = layer.activation.function(unactivated[^1])
    activations.add(activation)
  # Backwards pass

  # The gradient of the output layer for this sample: this is a
  # partial derivative, so the multiplication here is just an
  # application of the chain rule!
  var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.layers[^1].activation.derivative(unactivated[^1])
  deltaB[^1].replace(diff)
  # activations[^2] is the input that was fed to the last layer
  deltaW[^1].replace(diff.dot(activations[^2].transpose()))
  # The upper bound is len() and not high() so that the first
  # layer of the network gets its gradient computed as well
  for l in 2..self.layers.len():
    # The ^ makes our indices start from the back instead of
    # from the front, so we're really iterating over our layers
    # backwards! The error is pulled back through the weights of
    # the layer that comes right after this one, which is ^(l - 1)
    diff = self.layers[^(l - 1)].weights.transpose.dot(diff) * self.layers[^l].activation.derivative(unactivated[^l])
    deltaB[^l].replace(diff)
    # Since activations also stores the input sample, the input
    # of layer ^l lives at ^(l + 1)
    deltaW[^l].replace(diff.dot(activations[^(l + 1)].transpose()))
  return (deltaW, deltaB)
|
||||
|
||||
|
||||
proc backprop(self: NeuralNetwork, x, y: Matrix[float]) {.used.} =
|
||||
## Performs a single backpropagation step and updates the
|
||||
## gradients for our weights and biases, layer by layer
|
||||
proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
  ## Performs a single mini-batch step in stochastic gradient
  ## descent and updates the network's weights and biases
  ## accordingly. Each element of data pairs an input sample (x)
  ## with its expected output (y). An empty batch is a no-op

  # Averaging over zero samples would divide by zero and turn
  # every weight and bias into NaN, so we bail out early
  if data.len() == 0:
    return
  # Running sums of the per-sample gradients, one matrix per layer
  var
    weights: seq[Matrix[float]] = @[]
    biases: seq[Matrix[float]] = @[]
  for layer in self.layers:
    weights.add(zeros[float](layer.weights.shape))
    biases.add(zeros[float](layer.biases.shape))
  for dataPoint in data:
    let gradient = self.backprop(dataPoint.x, dataPoint.y)
    # zip() builds a new sequence, so writing to biases and
    # weights while looping over the pairs is safe
    for i, (currentBiases, newBiases) in zip(biases, gradient.biases):
      biases[i] = currentBiases + newBiases
    for i, (currentWeights, newWeights) in zip(weights, gradient.weights):
      weights[i] = currentWeights + newWeights
  # The backpropagation algorithm lets us find the gradient of steepest ascent
  # in our cost function, so we subtract it from the current weights and biases
  # to descend it the fastest (it's not actually *the* fastest because true gradient
  # descent would perform this over all training samples, but it's a pretty good
  # approximation nonetheless, it converges quickly and it actually helps prevent
  # overfitting by not letting the network train over the same data over and over
  # again)

  # The learning rate divided by the batch size turns the summed
  # gradients into an average. It is the same for every layer, so
  # it is computed only once
  let step = self.learnRate / data.len().float
  for (layer, newBiases) in zip(self.layers, biases):
    layer.biases = layer.biases - step * newBiases
  for (layer, newWeights) in zip(self.layers, weights):
    layer.weights = layer.weights - step * newWeights
|
||||
|
||||
|
||||
proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =
  ## Train the network on the given data for the specified
  ## number of epochs using the given batch size by applying
  ## stochastic gradient descent. The data is shuffled in place
  ## at the start of every epoch. The last batch of an epoch is
  ## shorter than the others when batchSize does not evenly
  ## divide the number of samples.
  ##
  ## Raises a ValueError if batchSize is not positive

  # A non-positive batch size would never advance through
  # the data set, causing the loop below to spin forever
  if batchSize <= 0:
    raise newException(ValueError, "batch size must be a positive integer")
  for _ in 0..<epochs:
    # We shuffle the data so that different epochs work
    # on different data points. This will hopefully help
    # the network generalize its training onto unseen data
    shuffle(data)
    var start = 0
    while start < data.len():
      # Every batch is a run of *distinct* consecutive samples,
      # clamped so that we never read past the end of the data
      let stop = min(start + batchSize, data.len())
      self.miniBatch(data[start..<stop])
      start = stop
|
||||
|
||||
|
||||
## Utility functions
|
||||
|
|
|
@ -425,7 +425,6 @@ proc copy*[T](self: MatrixView[T]): Matrix[T] =
|
|||
for e in self:
|
||||
result.data[].add(e)
|
||||
result.shape = self.shape
|
||||
result.m = self.m
|
||||
|
||||
|
||||
proc dup*[T](self: MatrixView[T]): MatrixView[T] =
|
||||
|
@ -462,6 +461,20 @@ proc `/`*[T](a: Matrix[T], b: T): Matrix[T] = a.copy().apply(divide, b, axis= -1
|
|||
proc `/`*[T](a: T, b: Matrix[T]): Matrix[T] = b.copy().apply(divide, a, axis= -1)
|
||||
|
||||
|
||||
proc `+`*[T](a: MatrixView[T], b: T): Matrix[T] =
  ## Copies the view a into a new matrix and applies add to
  ## it with the scalar b (axis -1)
  apply(copy(a), add, b, axis = -1)

proc `+`*[T](a: T, b: MatrixView[T]): Matrix[T] =
  ## Copies the view b into a new matrix and applies add to
  ## it with the scalar a (axis -1)
  apply(copy(b), add, a, axis = -1)

proc `-`*[T](a: MatrixView[T], b: T): Matrix[T] =
  ## Copies the view a into a new matrix and applies sub to
  ## it with the scalar b (axis -1)
  apply(copy(a), sub, b, axis = -1)

proc `-`*[T](a: T, b: MatrixView[T]): Matrix[T] =
  ## Copies the view b into a new matrix and applies sub to
  ## it with the scalar a (axis -1)
  # NOTE(review): the copied view is the first operand handed to
  # apply here, confirm this yields a - b rather than b - a
  apply(copy(b), sub, a, axis = -1)

proc `-`*[T](a: MatrixView[T]): Matrix[T] =
  ## Copies the view a into a new matrix and applies neg to
  ## it (axis -1)
  # NOTE(review): the view itself is passed as the extra operand
  # of apply, confirm that neg is meant to receive it
  apply(copy(a), neg, a, axis = -1)

proc `*`*[T](a: MatrixView[T], b: T): Matrix[T] =
  ## Copies the view a into a new matrix and applies mul to
  ## it with the scalar b (axis -1)
  apply(copy(a), mul, b, axis = -1)

proc `*`*[T](a: T, b: MatrixView[T]): Matrix[T] =
  ## Copies the view b into a new matrix and applies mul to
  ## it with the scalar a (axis -1)
  apply(copy(b), mul, a, axis = -1)

proc `/`*[T](a: MatrixView[T], b: T): Matrix[T] =
  ## Copies the view a into a new matrix and applies divide
  ## to it with the scalar b (axis -1)
  apply(copy(a), divide, b, axis = -1)

proc `/`*[T](a: T, b: MatrixView[T]): Matrix[T] =
  ## Copies the view b into a new matrix and applies divide
  ## to it with the scalar a (axis -1)
  # NOTE(review): the copied view is the first operand handed to
  # apply here, confirm this yields a / b rather than b / a
  apply(copy(b), divide, a, axis = -1)
|
||||
|
||||
|
||||
# matrix/matrix operations. They produce a new matrix with the
|
||||
# result of the operation
|
||||
|
||||
|
@ -942,6 +955,23 @@ proc dot*[T](self, other: Matrix[T]): Matrix[T] =
|
|||
return self * other
|
||||
|
||||
|
||||
proc dot*[T](self: MatrixView[T], other: Matrix[T]): Matrix[T] =
  ## Dot product between a matrix view and a matrix: the i-th
  ## entry of the output is the sum of other[0] multiplied by
  ## self[i]. Outside of release builds, a ValueError is raised
  ## when the two operands have a different number of columns
  when not defined(release):
    # This validation is compiled out when building for release
    let compatible = self.shape.cols == other.shape.cols
    if not compatible:
      raise newException(ValueError, &"incompatible argument shapes for dot product")
  # One output entry for each row of the view
  result = zeros[T]((0, self.shape.rows))
  for idx in 0..<result.shape.cols:
    result[0, idx] = sum(other[0] * self[idx])
|
||||
|
||||
|
||||
proc dot*[T](self: Matrix[T], other: MatrixView[T]): Matrix[T] {.inline.} =
  ## Matrix/view overload of the dot product: it simply forwards
  ## to the view/matrix overload with the two operands swapped
  dot(other, self)
|
||||
|
||||
|
||||
proc dot*[T](self, other: MatrixView[T]): T =
  ## Dot product of two matrix views: multiplies them together
  ## with `*` and returns the sum of the result as a scalar
  let product = self * other
  result = sum(product)
|
||||
|
||||
|
||||
proc where*[T](cond: Matrix[bool], x, y: Matrix[T]): Matrix[T] =
|
||||
## Return elements chosen from x or y depending on cond
|
||||
## Where cond is true, take elements from x, otherwise
|
||||
|
@ -1046,6 +1076,18 @@ proc count*[T](self: Matrix[T], e: T): int =
|
|||
inc(result)
|
||||
|
||||
|
||||
proc replace*[T](self: Matrix[T], other: Matrix[T], copy: bool = false) =
  ## Replaces the data in self with the data from other, in
  ## place. When copy is false (the default), self ends up
  ## pointing to the very same underlying storage as other;
  ## when copy is true, the contents of other are copied into
  ## the storage self already has instead. The order and the
  ## shape of other are adopted in both cases. Nothing is
  ## returned: self is modified directly
  if copy:
    self.data[] = other.data[]
  else:
    self.data = other.data
  # The metadata must follow the data regardless of how it got
  # here, or self would describe a matrix it no longer holds
  self.order = other.order
  self.shape = other.shape
|
||||
|
||||
|
||||
when isMainModule:
|
||||
import math
|
||||
|
||||
|
|
Loading…
Reference in New Issue