# NNExperiments/src/nn/network.nim (Nim, 329 lines, 14 KiB)

# Copyright 2023 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import ../util/matrix
import std/strformat
import std/random
import std/math
import std/sequtils
# Seed std/random's default generator once, when this module is loaded;
# the rand() calls in newNeuralNetwork draw from that generator
randomize()
type
  LayerKind* = enum
    ## The kinds of layer a network can be made of
    Dense, Dropout, Sparse

  Layer* = ref object
    ## A single layer of a neural network
    kind*: LayerKind         # TODO: add dropout and sparse layers
    inputSize*: int          # How many inputs the layer consumes
    outputSize*: int         # How many outputs the layer produces
    weights*: Matrix[float]  # One weight per connection (2D)
    biases*: Matrix[float]   # One bias per neuron (1D)

  Loss* = ref object
    ## A vectorized cost function paired with its derivative
    function: proc (a, b: Matrix[float]): float
    derivative: proc (x, y: Matrix[float]): Matrix[float]

  Activation* = ref object
    ## A vectorized activation function paired with
    ## its derivative
    function: proc (input: Matrix[float]): Matrix[float]
    derivative: proc (x: Matrix[float]): Matrix[float]

  NeuralNetwork* = ref object
    ## A generic feed-forward neural network
    layers*: seq[Layer]
    loss: Loss              # Cost function used during training
    activation: Activation  # Activation function shared by all layers
    # Step size used by gradient descent: the larger it
    # is, the further each update moves the parameters
    learnRate*: float
    # Extra scaling of the update step: when it is greater
    # than zero, miniBatch multiplies the step by 1 / momentum
    momentum*: float
proc `$`*(self: Layer): string =
  ## Renders the layer as text, showing how many
  ## inputs it takes and how many outputs it yields
  &"Layer(inputs={self.inputSize}, outputs={self.outputSize})"
proc `$`*(self: NeuralNetwork): string =
  ## Renders the network as text, showing its
  ## learn rate followed by each of its layers
  &"NeuralNetwork(learnRate={self.learnRate}, layers={self.layers})"
proc newLoss*(function: proc (a, b: Matrix[float]): float, derivative: proc (x, y: Matrix[float]): Matrix[float]): Loss =
  ## Bundles a cost function and its derivative
  ## into a freshly allocated Loss object
  Loss(function: function, derivative: derivative)
proc newActivation*(function: proc (input: Matrix[float]): Matrix[float], derivative: proc (x: Matrix[float]): Matrix[float]): Activation =
  ## Bundles an activation function and its derivative
  ## into a freshly allocated Activation object
  Activation(function: function, derivative: derivative)
proc newDenseLayer*(inputSize: int, outputSize: int): Layer =
  ## Allocates a dense layer taking inputSize inputs and
  ## producing outputSize outputs. Its weights and biases
  ## are left unset: newNeuralNetwork fills them in
  Layer(kind: Dense, inputSize: inputSize, outputSize: outputSize)
proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation,
                       learnRate: float, momentum: float, weightRange,
                       biasRange: tuple[start, stop: float]): NeuralNetwork =
  ## Builds a network out of the given layers and
  ## hyperparameters. Every layer in topology is given
  ## new biases and weights, drawn from biasRange and
  ## weightRange respectively using std/random's default
  ## generator
  result = NeuralNetwork(layers: topology, loss: lossFunc, activation: activationFunc,
                         learnRate: learnRate, momentum: momentum)
  for layer in result.layers:
    # One bias per output neuron (drawn before the weights)
    var biasValues = newSeqOfCap[float](layer.outputSize)
    for _ in 0..<layer.outputSize:
      biasValues.add(rand(biasRange.start..biasRange.stop))
    # One weight per connection, stored as a flat sequence
    var weightValues = newSeqOfCap[float](layer.inputSize * layer.outputSize)
    for _ in 0..<layer.outputSize:
      for _ in 0..<layer.inputSize:
        weightValues.add(rand(weightRange.start..weightRange.stop))
    layer.biases = newMatrix[float](biasValues)
    # The weights are shaped (outputSize, inputSize) rather than the
    # other way around so that feed can compute weights.dot(x) directly,
    # without transposing the matrix first
    layer.weights = newMatrixFromSeq[float](weightValues, (layer.outputSize, layer.inputSize))
proc feed(self: Layer, x: Matrix[float]): Matrix[float] =
  ## Runs x through the layer and returns the result: the
  ## weights dotted with x, plus the biases. No activation
  ## function is applied here
  let weighted = self.weights.dot(x)
  weighted + self.biases
proc fastFeedForward(self: NeuralNetwork, x: Matrix[float]): Matrix[float] {.used.} =
  ## Feeds the given input through the network and returns
  ## the (unactivated) output of the last layer. No
  ## intermediate result is kept around. Note that backprop
  ## scores the *activated* output against the target, so
  ## apply the network's activation function to the returned
  ## value to obtain a comparable prediction
  result = x
  for i, layer in self.layers:
    # Every layer but the first consumes the activated output of
    # its predecessor. Without this step the stacked layers collapse
    # into a single linear map and the activation is never used
    if i > 0:
      result = self.activation.function(result)
    result = layer.feed(result)
proc feedForward(self: NeuralNetwork, x: Matrix[float]): seq[Matrix[float]] =
  ## Feeds the given input through the network. The returned
  ## sequence starts with x itself, followed by the unactivated
  ## output of each layer, in order
  result.add(x)
  for i, layer in self.layers:
    # The first layer reads the raw input; every other layer reads the
    # activated output of its predecessor. Feeding unactivated outputs
    # forward would turn the whole network into a single linear map
    let layerInput =
      if i == 0: x
      else: self.activation.function(result[^1])
    result.add(layer.feed(layerInput))
# TODO: Consider optimizing this to take n m-length vectors in an m*n matrix instead
# of calling it with every sample in every mini-batch, offloading the heavy lifting
# to the matrix library (this hasn't been done yet, both for simplicity and because
# the matrix library we use is not efficient anyway)
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
  ## Performs a single backpropagation step for the training
  ## sample (x, y) and returns the gradient of the network's
  ## cost function w.r.t. the weights and biases of each layer
  ## (i.e. the direction of steepest ascent). Both returned
  ## sequences hold one matrix per layer, in layer order, shaped
  ## like that layer's weights and biases. A network without
  ## layers yields two empty sequences
  var
    # The gradients for the weights and biases
    # of each layer in the network
    deltaW: seq[Matrix[float]] = @[]
    deltaB: seq[Matrix[float]] = @[]
    # Activated outputs: index 0 is the raw input (it is the
    # "activation" fed to the first layer), index i is the
    # activated output of layer i - 1
    activations: seq[Matrix[float]] = @[x]
    # Unactivated outputs: index i belongs to layer i
    unactivated: seq[Matrix[float]] = @[]
  # Initialize all of our gradients to zero
  for layer in self.layers:
    deltaW.add(zeros[float](layer.weights.shape))
    deltaB.add(zeros[float](layer.biases.shape))
  if self.layers.len == 0:
    return (deltaW, deltaB)
  # Forward pass. It is performed here so that the unactivated and
  # activated outputs are recorded side by side, with every layer
  # consuming the activated output of the one before it
  for layer in self.layers:
    unactivated.add(layer.feed(activations[^1]))
    activations.add(self.activation.function(unactivated[^1]))
  # Error of the output layer: by the chain rule it is the derivative of
  # the cost w.r.t. the network's output times the derivative of the
  # activation function at the last layer's unactivated output
  var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.activation.derivative(unactivated[^1])
  deltaB[^1].replace(diff)
  # The weight gradient is the layer's error combined with the
  # activations that were fed into the layer
  deltaW[^1].replace(diff.dot(activations[^2].transpose()))
  # Backwards pass: l counts layers from the end, so l = 2 is the second
  # to last layer and l = self.layers.len is the first one
  for l in 2..self.layers.len:
    # Pull the error of the following layer (^(l - 1)) back through its
    # weights, then scale it by the activation's derivative at this layer
    diff = self.layers[^(l - 1)].weights.transpose().dot(diff) * self.activation.derivative(unactivated[^l])
    deltaB[^l].replace(diff)
    # activations has one more entry than there are layers (the
    # input), hence the input of layer ^l lives at ^(l + 1)
    deltaW[^l].replace(diff.dot(activations[^(l + 1)].transpose()))
  return (deltaW, deltaB)
proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
  ## Performs a single mini-batch step of stochastic gradient
  ## descent: the gradients of every sample in data are summed
  ## up and the network's weights and biases are then moved
  ## against that sum, scaled by the step size. An empty batch
  ## leaves the network untouched
  if data.len() == 0:
    # There is nothing to learn from, and dividing the learn rate by
    # a batch size of zero would fill the parameters with inf/NaN
    return
  var
    # Per-layer sums of the gradients of all samples in the batch
    weightSums: seq[Matrix[float]] = @[]
    biasSums: seq[Matrix[float]] = @[]
  for layer in self.layers:
    weightSums.add(zeros[float](layer.weights.shape))
    biasSums.add(zeros[float](layer.biases.shape))
  for dataPoint in data:
    let gradient = self.backprop(dataPoint.x, dataPoint.y)
    for i in 0..<self.layers.len:
      biasSums[i] = biasSums[i] + gradient.biases[i]
      weightSums[i] = weightSums[i] + gradient.weights[i]
  # Dividing the learn rate by the batch size turns the summed gradient
  # into the average gradient over the batch, so the size of a step
  # does not depend on how many samples the batch happens to hold
  var nudge = self.learnRate / data.len().float
  if self.momentum > 0:
    # NOTE(review): this is an ad-hoc scaling of the step size, not
    # the usual velocity-based momentum. It is kept as is so that
    # existing hyperparameters keep their meaning
    nudge *= (1 / self.momentum)
  # Backpropagation yields the direction of steepest ascent of the cost
  # function, which we are trying to minimize: each parameter therefore
  # moves *against* its gradient, i.e. p = p - nudge * gradient
  for i, layer in self.layers:
    layer.biases = layer.biases - nudge * biasSums[i]
    layer.weights = layer.weights - nudge * weightSums[i]
proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =
  ## Trains the network on the given data for the specified
  ## number of epochs by applying mini-batch stochastic
  ## gradient descent. The data is shuffled in place at the
  ## start of every epoch and then cut into consecutive batches
  ## of batchSize samples (the last batch is shorter when the
  ## number of samples is not a multiple of batchSize). A
  ## progress line is echoed after each epoch. Raises a
  ## ValueError if batchSize is not positive
  if batchSize <= 0:
    # A non-positive batch size would never advance through the data
    raise newException(ValueError, &"batchSize must be positive, got {batchSize}")
  for epoch in 0..<epochs:
    # We shuffle the data so that the batches differ from one
    # epoch to the next. This will hopefully help the network
    # generalize its training onto unseen data
    shuffle(data)
    var start = 0
    while start < data.len():
      # Clamp the end of the batch so that the last
      # one never reads past the end of the data
      let stop = min(start + batchSize, data.len())
      self.miniBatch(data[start..<stop])
      start = stop
    echo &"Completed training epoch {epoch + 1}"
## Utility functions
# Mean squared error
proc mse(a, b: Matrix[float]): float =
  ## Squares the element-wise difference between b and a,
  ## sums the squares and divides the total by len(a)
  let squares = (b - a).apply(proc (x: float): float = pow(x, 2), axis = -1)
  squares.sum() / len(a).float
# Derivative of MSE
func dxMSE(x, y: Matrix[float]): Matrix[float] =
  ## Returns twice the element-wise difference between x and y
  let difference = x - y
  result = 2.0 * difference
func sigmoid(x: float): float =
  ## The logistic function of a single value: 1 / (1 + e^-x)
  let decay = exp(-x)
  result = 1 / (1 + decay)
# A bunch of vectorized activation functions
proc sigmoid(input: Matrix[float]): Matrix[float] =
  ## Applies the scalar sigmoid to every element of input
  input.apply(sigmoid, axis = -1)
proc sigmoidDerivative(input: Matrix[float]): Matrix[float] =
  ## Derivative of the sigmoid at input: s * (1 - s),
  ## where s is the sigmoid of input
  let activated = sigmoid(input)
  result = activated * (1.0 - activated)
proc softmax(input: Matrix[float]): Matrix[float] =
  ## Exponentiates every element of input and divides
  ## each exponential by the sum of all of them
  # The maximum is subtracted from every element first so that the
  # exponentials cannot blow up when the input holds large values
  let shifted = input - input.max()
  let exponentials = shifted.apply(math.exp, axis = -1)
  result = exponentials / exponentials.sum()
proc softmaxDerivative(input: Matrix[float]): Matrix[float] =
  ## Returns the Jacobian of softmax at input, that is
  ## diagflat(s) - s * s^T where s is softmax(input)
  # The formula is stated in terms of the softmax *output*. Like the
  # other derivatives in this module we are handed the unactivated
  # values instead, so they have to be run through softmax first
  let activated = softmax(input)
  # Turn the probabilities into a column vector
  var column = activated.reshape(activated.shape.cols, 1)
  # NOTE(review): the result is a full matrix rather than a vector shaped
  # like the input, unlike the other activation derivatives in this file.
  # Confirm that callers multiplying it element-wise can handle that
  result = column.diagflat() - column.dot(column.transpose())
proc relu(input: Matrix[float]): Matrix[float] =
  ## Replaces every element of input with max(0.0, element)
  let clamp = proc (x: float): float = max(0.0, x)
  input.apply(clamp, axis = -1)
proc dxRelu(input: Matrix[float]): Matrix[float] =
  ## Derivative of ReLU: picks from a matrix of ones
  ## wherever input is greater than zero, and 0 elsewhere
  let slopes = ones[float](input.shape)
  result = where(input > 0.0, slopes, 0)
proc silu(input: Matrix[float]): Matrix[float] =
  ## Replaces every element x of input with x * sigmoid(x)
  let swish = proc (x: float): float = x * sigmoid(x)
  input.apply(swish, axis = -1)
proc dSilu(input: Matrix[float]): Matrix[float] =
  ## Derivative of SiLU: every element x of input becomes
  ## sigmoid(x) * (1 + x * (1 - sigmoid(x)))
  let slope = proc (x: float): float =
    let s = sigmoid(x)
    result = s * (1 + x * (1 - s))
  input.apply(slope, axis = -1)
proc htan(input: Matrix[float]): Matrix[float] =
  ## Applies the hyperbolic tangent to every element of input
  # std/math's tanh is used instead of (e^2x - 1) / (e^2x + 1): for
  # large x the exponential overflows to infinity and the quotient
  # becomes inf / inf, which is NaN rather than the expected 1.0
  input.apply(proc (x: float): float = tanh(x), axis = -1)
proc htanDx(input: Matrix[float]): Matrix[float] =
  ## Derivative of the hyperbolic tangent: every
  ## element x of input becomes 1 - tanh(x)^2
  let slope = proc (x: float): float =
    let squared = pow(tanh(x), 2)
    result = 1 - squared
  input.apply(slope, axis = -1)
{.push.}
{.hints: off.}  # Keeps nim quiet about the capitalized names below
# Ready-made activation and loss objects, exported for use by callers
var
  Sigmoid* = newActivation(sigmoid, sigmoidDerivative)
  Softmax* = newActivation(softmax, softmaxDerivative)
  ReLU* = newActivation(relu, dxRelu)
  SiLU* = newActivation(silu, dSilu)
  HTan* = newActivation(htan, htanDx)
  MSE* = newLoss(mse, dxMSE)
{.pop.}