Initial ground work for neural network complete
This commit is contained in:
parent
9506c554ec
commit
a97cec41a6
10
src/main.nim
10
src/main.nim
|
@ -2,9 +2,9 @@ import nn/network
|
||||||
import nn/util/matrix
|
import nn/util/matrix
|
||||||
|
|
||||||
|
|
||||||
var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, Sigmoid),
|
var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid),
|
||||||
newDenseLayer(3, 2, Sigmoid),
|
newDenseLayer(10, 16, Sigmoid),
|
||||||
newDenseLayer(2, 3, Softmax)],
|
newDenseLayer(16, 10, Softmax)],
|
||||||
lossFunc=MSE, learnRate=0.05, momentum=0.55,
|
lossFunc=MSE, learnRate=0.05, momentum=0.55,
|
||||||
weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -1.0, stop: 1.0))
|
weightRange=(start: -1.0, stop: 1.0),
|
||||||
echo mlp.feedforward(newMatrix[float](@[1.0, 2.0]))
|
biasRange=(start: -1.0, stop: 1.0))
|
||||||
|
|
|
@ -18,6 +18,7 @@ import util/matrix
|
||||||
import std/strformat
|
import std/strformat
|
||||||
import std/random
|
import std/random
|
||||||
import std/math
|
import std/math
|
||||||
|
import std/sequtils
|
||||||
|
|
||||||
|
|
||||||
randomize()
|
randomize()
|
||||||
|
@ -128,22 +129,94 @@ proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, m
|
||||||
result.momentum = momentum
|
result.momentum = momentum
|
||||||
|
|
||||||
|
|
||||||
proc feedforward*(self: NeuralNetwork, data: Matrix[float]): Matrix[float] =
|
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
|
||||||
## Feeds the given input through the network and returns
|
## Performs a single backpropagation step and returns the
|
||||||
## a 1D array with the output
|
## gradient of the cost function for the weights and biases
|
||||||
when not defined(release):
|
## of the network according to the given training sample
|
||||||
if data.shape.rows > 1:
|
var
|
||||||
raise newException(ValueError, "input data must be one-dimensional")
|
# The deltas for the weights and biases of
|
||||||
if data.shape.cols != self.layers[0].inputSize:
|
# each layer in the network
|
||||||
raise newException(ValueError, &"input is of the wrong shape (expecting (1, {self.layers[0].inputSize}), got ({data.shape.rows}, {data.shape.cols}) instead)")
|
deltaW: seq[Matrix[float]] = @[]
|
||||||
result = data
|
deltaB: seq[Matrix[float]] = @[]
|
||||||
|
# Activations of each layer
|
||||||
|
activation = x
|
||||||
|
activations: seq[Matrix[float]] = @[x]
|
||||||
|
# Unactivated outputs of each layer
|
||||||
|
unactivated: seq[Matrix[float]] = @[]
|
||||||
|
# Forward pass through the network
|
||||||
for layer in self.layers:
|
for layer in self.layers:
|
||||||
result = layer.activation.function(layer.weights.dot(result) + layer.biases)
|
deltaW.add(zeros[float](layer.weights.shape))
|
||||||
|
deltaB.add(zeros[float](layer.biases.shape))
|
||||||
|
unactivated.add(layer.weights.dot(activation) + layer.biases)
|
||||||
|
activations.add(layer.activation.function(unactivated[^1]))
|
||||||
|
# Backwards pass
|
||||||
|
|
||||||
|
# The negative gradient of each layer for this sample: this is a
|
||||||
|
# partial derivative, so the multiplication here is just an
|
||||||
|
# application of the chain rule!
|
||||||
|
var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.layers[^1].activation.derivative(unactivated[^1])
|
||||||
|
deltaB[^1].replace(diff)
|
||||||
|
deltaW[^1].replace(activations[^2].transpose())
|
||||||
|
for l in 2..self.layers.high():
|
||||||
|
# The ^ makes our indices start from the back instead of
|
||||||
|
# from the front, so we're really iterating over our layers
|
||||||
|
# backwards!
|
||||||
|
diff = self.layers[^l].weights.transpose.dot(diff) * self.layers[^l].activation.derivative(unactivated[^l])
|
||||||
|
deltaB[^l].replace(diff)
|
||||||
|
deltaW[^l].replace(diff.dot(activations[^(l - 1)].transpose()))
|
||||||
|
return (deltaW, deltaB)
|
||||||
|
|
||||||
|
|
||||||
proc backprop(self: NeuralNetwork, x, y: Matrix[float]) {.used.} =
|
proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
|
||||||
## Performs a single backpropagation step and updates the
|
## Performs a single mini-batch step in stochastic gradient
|
||||||
## gradients for our weights and biases, layer by layer
|
## descent and updates the network's weights and biases
|
||||||
|
## accordingly
|
||||||
|
var gradient: tuple[weights, biases: seq[Matrix[float]]]
|
||||||
|
# New weights and biases
|
||||||
|
var
|
||||||
|
weights: seq[Matrix[float]] = @[]
|
||||||
|
biases: seq[Matrix[float]] = @[]
|
||||||
|
for layer in self.layers:
|
||||||
|
weights.add(zeros[float](layer.weights.shape))
|
||||||
|
biases.add(zeros[float](layer.biases.shape))
|
||||||
|
for dataPoint in data:
|
||||||
|
gradient = self.backprop(dataPoint.x, dataPoint.y)
|
||||||
|
for i, (currentBiases, newBiases) in zip(biases, gradient.biases):
|
||||||
|
biases[i] = currentBiases + newBiases
|
||||||
|
for i, (currentWeights, newWeights) in zip(weights, gradient.weights):
|
||||||
|
weights[i] = currentWeights + newWeights
|
||||||
|
# The backpropagation algorithm lets us find the gradient of steepest ascent
|
||||||
|
# in our cost function, so we subtract it from the current weights and biases
|
||||||
|
# to descend it the fastest (it's not actually *the* fastest because true gradient
|
||||||
|
# descent would perform this over all training samples, but it's a pretty good
|
||||||
|
# approximation nonetheless, it converges quickly and it actually helps prevent
|
||||||
|
# overfitting by not letting the network train over the same data over and over
|
||||||
|
# again)
|
||||||
|
for (layer, newBiases) in zip(self.layers, biases):
|
||||||
|
layer.biases = layer.biases - (self.learnRate / data.len().float) * newBiases
|
||||||
|
for (layer, newWeights) in zip(self.layers, weights):
|
||||||
|
layer.weights = layer.weights - (self.learnRate / data.len().float) * newWeights
|
||||||
|
|
||||||
|
|
||||||
|
proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =
|
||||||
|
## Train the network on the given data for the specified
|
||||||
|
## number of epochs using the given batch size by applying
|
||||||
|
## stochastic gradient descent
|
||||||
|
var batches: seq[seq[tuple[x, y: Matrix[float]]]]
|
||||||
|
for epoch in 0..<epochs:
|
||||||
|
# We shuffle the data so that different epochs work
|
||||||
|
# on different data points. This will hopefully help
|
||||||
|
# the network generalize its training onto unseen data
|
||||||
|
shuffle(data)
|
||||||
|
batches = @[]
|
||||||
|
var i = 0
|
||||||
|
while i < data.len():
|
||||||
|
batches.add(@[])
|
||||||
|
for j in 0..<batchSize:
|
||||||
|
batches[^1].add(data[i])
|
||||||
|
i += batchSize
|
||||||
|
for batch in batches:
|
||||||
|
self.miniBatch(batch)
|
||||||
|
|
||||||
|
|
||||||
## Utility functions
|
## Utility functions
|
||||||
|
|
|
@ -425,7 +425,6 @@ proc copy*[T](self: MatrixView[T]): Matrix[T] =
|
||||||
for e in self:
|
for e in self:
|
||||||
result.data[].add(e)
|
result.data[].add(e)
|
||||||
result.shape = self.shape
|
result.shape = self.shape
|
||||||
result.m = self.m
|
|
||||||
|
|
||||||
|
|
||||||
proc dup*[T](self: MatrixView[T]): MatrixView[T] =
|
proc dup*[T](self: MatrixView[T]): MatrixView[T] =
|
||||||
|
@ -462,6 +461,20 @@ proc `/`*[T](a: Matrix[T], b: T): Matrix[T] = a.copy().apply(divide, b, axis= -1
|
||||||
proc `/`*[T](a: T, b: Matrix[T]): Matrix[T] = b.copy().apply(divide, a, axis= -1)
|
proc `/`*[T](a: T, b: Matrix[T]): Matrix[T] = b.copy().apply(divide, a, axis= -1)
|
||||||
|
|
||||||
|
|
||||||
|
proc `+`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(add, b, axis= -1)
|
||||||
|
proc `+`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(add, a, axis= -1)
|
||||||
|
|
||||||
|
proc `-`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(sub, b, axis= -1)
|
||||||
|
proc `-`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(sub, a, axis= -1)
|
||||||
|
proc `-`*[T](a: MatrixView[T]): Matrix[T] = a.copy().apply(neg, a, axis= -1)
|
||||||
|
|
||||||
|
proc `*`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(mul, b, axis = -1)
|
||||||
|
proc `*`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(mul, a, axis= -1)
|
||||||
|
|
||||||
|
proc `/`*[T](a: MatrixView[T], b: T): Matrix[T] = a.copy().apply(divide, b, axis= -1)
|
||||||
|
proc `/`*[T](a: T, b: MatrixView[T]): Matrix[T] = b.copy().apply(divide, a, axis= -1)
|
||||||
|
|
||||||
|
|
||||||
# matrix/matrix operations. They produce a new matrix with the
|
# matrix/matrix operations. They produce a new matrix with the
|
||||||
# result of the operation
|
# result of the operation
|
||||||
|
|
||||||
|
@ -942,6 +955,23 @@ proc dot*[T](self, other: Matrix[T]): Matrix[T] =
|
||||||
return self * other
|
return self * other
|
||||||
|
|
||||||
|
|
||||||
|
proc dot*[T](self: MatrixView[T], other: Matrix[T]): Matrix[T] =
|
||||||
|
## Computes the dot product of the two
|
||||||
|
## input matrices
|
||||||
|
when not defined(release):
|
||||||
|
if self.shape.cols != other.shape.cols:
|
||||||
|
raise newException(ValueError, &"incompatible argument shapes for dot product")
|
||||||
|
result = zeros[T]((0, self.shape.rows))
|
||||||
|
for i in 0..<result.shape.cols:
|
||||||
|
result[0, i] = (other[0] * self[i]).sum()
|
||||||
|
|
||||||
|
|
||||||
|
proc dot*[T](self: Matrix[T], other: MatrixView[T]): Matrix[T] {.inline.} = result = other.dot(self)
|
||||||
|
|
||||||
|
|
||||||
|
proc dot*[T](self, other: MatrixView[T]): T = (self * other).sum()
|
||||||
|
|
||||||
|
|
||||||
proc where*[T](cond: Matrix[bool], x, y: Matrix[T]): Matrix[T] =
|
proc where*[T](cond: Matrix[bool], x, y: Matrix[T]): Matrix[T] =
|
||||||
## Return elements chosen from x or y depending on cond
|
## Return elements chosen from x or y depending on cond
|
||||||
## Where cond is true, take elements from x, otherwise
|
## Where cond is true, take elements from x, otherwise
|
||||||
|
@ -1046,6 +1076,18 @@ proc count*[T](self: Matrix[T], e: T): int =
|
||||||
inc(result)
|
inc(result)
|
||||||
|
|
||||||
|
|
||||||
|
proc replace*[T](self: Matrix[T], other: Matrix[T], copy: bool = false) =
|
||||||
|
## Replaces the data in self with the data from
|
||||||
|
## other (a copy is not performed unless copy equals
|
||||||
|
## true). Nothing is returned: self is modified in place
|
||||||
|
if copy:
|
||||||
|
self.data[] = other.data[]
|
||||||
|
else:
|
||||||
|
self.data = other.data
|
||||||
|
self.order = other.order
|
||||||
|
self.shape = other.shape
|
||||||
|
|
||||||
|
|
||||||
when isMainModule:
|
when isMainModule:
|
||||||
import math
|
import math
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue