329 lines
14 KiB
Nim
329 lines
14 KiB
Nim
# Copyright 2023 Mattia Giambirtone & All Contributors
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import ../util/matrix
|
|
|
|
|
|
import std/strformat
|
|
import std/random
|
|
import std/math
|
|
import std/sequtils
|
|
|
|
|
|
# Seed Nim's default PRNG when this module is loaded; the rand() calls
# that initialize weights/biases and the shuffle() used while training
# draw from it
randomize()
|
|
|
|
|
|
type
  LayerKind* = enum
    ## The kinds of layer a network
    ## can be assembled from
    Dense, Dropout, Sparse

  Layer* = ref object
    ## A single, generic layer of a
    ## neural network
    kind*: LayerKind # TODO (add dropout and sparse layer!)
    inputSize*: int # How many input values the layer consumes
    outputSize*: int # How many output values the layer emits
    weights*: Matrix[float] # One weight per connection (2D)
    biases*: Matrix[float] # One bias per neuron (1D)

  Loss* = ref object
    ## A vectorized cost function bundled
    ## together with its derivative
    function: proc (a, b: Matrix[float]): float
    derivative: proc (x, y: Matrix[float]): Matrix[float]

  Activation* = ref object
    ## A vectorized activation function bundled
    ## together with its derivative
    function: proc (input: Matrix[float]): Matrix[float]
    derivative: proc (x: Matrix[float]): Matrix[float]

  NeuralNetwork* = ref object
    ## A generic feed-forward neural
    ## network
    layers*: seq[Layer]
    loss: Loss # Cost function used while training
    activation: Activation # Activation function shared by the layers
    # Step size used by gradient descent: it controls
    # how far the weights and biases move at each update
    learnRate*: float
    # Additional knob for SGD: when it is greater than
    # zero it rescales the step taken for each mini-batch
    momentum*: float
|
|
|
|
|
|
proc `$`*(self: Layer): string =
  ## Renders the layer as a human-readable
  ## string showing its input and output sizes
  "Layer(inputs=" & $self.inputSize & ", outputs=" & $self.outputSize & ")"
|
|
|
|
|
|
proc `$`*(self: NeuralNetwork): string =
  ## Renders the network as a human-readable
  ## string showing its learn rate and layers
  result = "NeuralNetwork(learnRate="
  result.add($self.learnRate)
  result.add(", layers=")
  result.add($self.layers)
  result.add(")")
|
|
|
|
|
|
proc newLoss*(function: proc (a, b: Matrix[float]): float,
              derivative: proc (x, y: Matrix[float]): Matrix[float]): Loss =
  ## Builds a Loss object out of a cost
  ## function and its derivative
  Loss(function: function, derivative: derivative)
|
|
|
|
|
|
proc newActivation*(function: proc (input: Matrix[float]): Matrix[float],
                    derivative: proc (x: Matrix[float]): Matrix[float]): Activation =
  ## Builds an Activation object out of an
  ## activation function and its derivative
  Activation(function: function, derivative: derivative)
|
|
|
|
|
|
proc newDenseLayer*(inputSize: int, outputSize: int): Layer =
  ## Builds a dense layer that takes inputSize
  ## values in and produces outputSize values.
  ## Weights and biases are left unset here: they
  ## are filled in by newNeuralNetwork
  Layer(kind: Dense, inputSize: inputSize, outputSize: outputSize)
|
|
|
|
|
|
proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation,
                       learnRate: float, momentum: float, weightRange,
                       biasRange: tuple[start, stop: float]): NeuralNetwork =
  ## Builds a neural network out of the given
  ## layers and hyperparameters. Every layer gets
  ## its biases and weights drawn at random from
  ## biasRange and weightRange respectively, using
  ## nim's default PRNG
  result = NeuralNetwork(layers: topology, loss: lossFunc, activation: activationFunc,
                         learnRate: learnRate, momentum: momentum)
  let
    biasSpan = biasRange.start..biasRange.stop
    weightSpan = weightRange.start..weightRange.stop
  for layer in result.layers:
    # Biases are drawn before the weights so the sequence of
    # PRNG calls per layer stays the same
    var biasValues = newSeqOfCap[float](layer.outputSize)
    for _ in 1..layer.outputSize:
      biasValues.add(rand(biasSpan))
    var weightValues = newSeqOfCap[float](layer.inputSize * layer.outputSize)
    for _ in 1..layer.outputSize:
      for _ in 1..layer.inputSize:
        weightValues.add(rand(weightSpan))
    layer.biases = newMatrix[float](biasValues)
    # The weight matrix is shaped (outputSize, inputSize) rather than the
    # other way around so that it can be multiplied with the layer's input
    # directly, without transposing it first
    layer.weights = newMatrixFromSeq[float](weightValues, (layer.outputSize, layer.inputSize))
|
|
|
|
|
|
proc feed(self: Layer, x: Matrix[float]): Matrix[float] =
  ## Runs the given input through the layer and
  ## returns its output: the weights are multiplied
  ## with the input and the biases are added on top
  let weighted = self.weights.dot(x)
  weighted + self.biases
|
|
|
|
|
|
proc fastFeedForward(self: NeuralNetwork, x: Matrix[float]): Matrix[float] {.used.} =
  ## Feeds the given input through the network. The
  ## (unactivated) output from the last layer is returned.
  ## Every layer after the first one receives the *activated*
  ## output of the layer before it
  result = x
  for i, layer in self.layers:
    # The raw input goes into the first layer as-is. From then on, the
    # previous layer's output goes through the activation function before
    # being fed forward: without this step the whole network would collapse
    # into a single linear transformation
    if i > 0:
      result = self.activation.function(result)
    result = layer.feed(result)
|
|
|
|
|
|
proc feedForward(self: NeuralNetwork, x: Matrix[float]): seq[Matrix[float]] =
  ## Feeds the given input through the network.
  ## The returned sequence starts with the input
  ## itself, followed by the unactivated output
  ## of each layer, in order. Every layer after
  ## the first one receives the *activated* output
  ## of the layer before it
  result = newSeqOfCap[Matrix[float]](self.layers.len + 1)
  result.add(x)
  for i, layer in self.layers:
    # The first layer consumes the raw input, every other layer consumes
    # the activation of the previous layer's output (feeding the unactivated
    # value forward would make the network purely linear and would not match
    # the activation derivatives used during backpropagation)
    let layerInput =
      if i == 0: x
      else: self.activation.function(result[^1])
    result.add(layer.feed(layerInput))
|
|
|
|
# TODO: Consider optimizing this to take n m-length vectors in an m*n matrix instead
# of calling it with every sample in every mini-batch to offload the heavy lifting
# to the matrix library (this hasn't been done yet, both for simplicity purposes and
# also because the matrix library we use is horribly inefficient anyway)
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
  ## Performs a single backpropagation step with the given
  ## training sample (input x, expected output y) and returns
  ## the gradient of the network's cost function w.r.t. the
  ## weights and biases, i.e. its direction of steepest ascent.
  ## Both returned sequences hold one matrix per layer, in the
  ## same order as self.layers. A network without layers yields
  ## two empty sequences
  var
    # The deltas for the weights and biases of
    # each layer in the network
    deltaW: seq[Matrix[float]] = @[]
    deltaB: seq[Matrix[float]] = @[]
    # Activations of each layer. The raw input counts as the
    # "activation" feeding the first layer, so this always has
    # one more element than `unactivated`
    activations: seq[Matrix[float]] = @[x]
    # Unactivated outputs of each layer
    unactivated: seq[Matrix[float]] = @[]

  if self.layers.len == 0:
    # Nothing to differentiate
    return (deltaW, deltaB)

  # Initialize all of our deltas to zero
  for layer in self.layers:
    deltaW.add(zeros[float](layer.weights.shape))
    deltaB.add(zeros[float](layer.biases.shape))

  # Forward pass through the network: each layer is fed the activation
  # of the previous one, and both the unactivated and the activated
  # outputs are recorded because the backwards pass needs them
  for layer in self.layers:
    unactivated.add(layer.feed(activations[^1]))
    activations.add(self.activation.function(unactivated[^1]))

  # Error of the output layer. The cost function depends on the weights
  # and biases only through the layer outputs, so the chain rule is
  # applied: derivative of the cost w.r.t. the output, times derivative
  # of the output w.r.t. the unactivated value
  var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.activation.derivative(unactivated[^1])
  deltaB[^1].replace(diff)
  # The weight gradient of a layer is its error times the (transposed)
  # activation that was fed into it
  deltaW[^1].replace(diff.dot(activations[^2].transpose()))

  # Backwards pass: walk the remaining layers from the second-to-last
  # down to the first. l counts layers from the end, so layer ^l receives
  # its error from layer ^(l - 1), the one right after it
  for l in 2..self.layers.len:
    diff = self.layers[^(l - 1)].weights.transpose().dot(diff) * self.activation.derivative(unactivated[^l])
    deltaB[^l].replace(diff)
    # activations has one extra leading element (the input), hence
    # the input to layer ^l sits at ^(l + 1)
    deltaW[^l].replace(diff.dot(activations[^(l + 1)].transpose()))
  return (deltaW, deltaB)
|
|
|
|
|
|
proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
  ## Performs a single mini-batch step in stochastic gradient
  ## descent and updates the network's weights and biases
  ## accordingly. An empty batch leaves the network untouched
  if data.len == 0:
    # Avoids dividing the learn rate by zero below
    return
  # Running sums of the gradients of every sample in the batch
  var
    weights: seq[Matrix[float]] = @[]
    biases: seq[Matrix[float]] = @[]
  for layer in self.layers:
    weights.add(zeros[float](layer.weights.shape))
    biases.add(zeros[float](layer.biases.shape))
  for dataPoint in data:
    let gradient = self.backprop(dataPoint.x, dataPoint.y)
    for i, sampleBiases in gradient.biases:
      biases[i] = biases[i] + sampleBiases
    for i, sampleWeights in gradient.weights:
      weights[i] = weights[i] + sampleWeights

  # The gradients above are summed over the batch, so dividing the learn
  # rate by the batch size makes the update use their average: the step
  # size then does not depend on how many samples the batch holds
  var nudge = self.learnRate / data.len().float
  if self.momentum > 0:
    # NOTE: this is an ad-hoc rescaling of the step size, not the
    # velocity-based momentum other libraries implement
    nudge *= (1 / self.momentum)
  # The backpropagation algorithm gives us the direction of steepest ascent
  # of the cost function, which we are trying to minimize: we therefore
  # subtract the (scaled) gradient from the current weights and biases. This
  # is only an approximation of true gradient descent, which would use all
  # training samples at once, but it is a good one and it converges quickly
  for i, layer in self.layers:
    layer.biases = layer.biases - nudge * biases[i]
    layer.weights = layer.weights - nudge * weights[i]
|
|
|
|
|
|
proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =
  ## Trains the network on the given data for the specified
  ## number of epochs using the given batch size by applying
  ## stochastic gradient descent. The data is shuffled in place
  ## at the start of every epoch; when its length is not a
  ## multiple of batchSize the last batch of each epoch is
  ## simply smaller. Raises ValueError if batchSize is not
  ## positive
  if batchSize <= 0:
    # A non-positive batch size would never advance through the data
    raise newException(ValueError, "batchSize must be a positive integer")
  for epoch in 0..<epochs:
    # We shuffle the data so that different epochs work
    # on different batches. This will hopefully help
    # the network generalize its training onto unseen data
    shuffle(data)
    var start = 0
    while start < data.len():
      # Each batch is a distinct, contiguous slice of the shuffled
      # data, clamped so the final one does not run past the end
      let stop = min(start + batchSize, data.len())
      self.miniBatch(data[start..<stop])
      start = stop
    echo &"Completed training epoch {epoch + 1}"
|
|
|
|
## Utility functions
|
|
|
|
# Mean squared error
proc mse(a, b: Matrix[float]): float =
  ## Squares every element of b - a, sums the
  ## squares up and divides the total by len(a)
  let square = proc (x: float): float = pow(x, 2)
  let squaredErrors = (b - a).apply(square, axis = -1)
  squaredErrors.sum() / len(a).float
|
|
|
|
# Derivative of MSE
func dxMSE(x, y: Matrix[float]): Matrix[float] =
  ## Returns twice the element-wise
  ## difference between x and y
  let error = x - y
  result = 2.0 * error
|
|
|
|
|
|
func sigmoid(x: float): float =
  ## The logistic function: squashes any real
  ## number into the range between 0 and 1
  let decay = exp(-x)
  result = 1 / (1 + decay)
|
|
|
|
# A bunch of vectorized activation functions
|
|
|
|
proc sigmoid(input: Matrix[float]): Matrix[float] =
  ## Vectorized sigmoid: applies the scalar
  ## sigmoid to the elements of the input
  input.apply(sigmoid, axis = -1)
|
|
|
|
proc sigmoidDerivative(input: Matrix[float]): Matrix[float] =
  ## Derivative of the sigmoid at the given
  ## (unactivated) input: s * (1 - s), where
  ## s is the sigmoid of the input
  let activated = sigmoid(input)
  activated * (1.0 - activated)
|
|
|
|
|
|
proc softmax(input: Matrix[float]): Matrix[float] =
  ## Vectorized softmax. The input's maximum is
  ## subtracted from it before exponentiating, so
  ## that large inputs don't make the exponentials
  ## blow up
  let shifted = input - input.max()
  let exponentials = shifted.apply(math.exp, axis = -1)
  exponentials / exponentials.sum()
|
|
|
|
proc softmaxDerivative(input: Matrix[float]): Matrix[float] =
  ## Derivative of softmax at the given (unactivated)
  ## input. With s = softmax(input) laid out as a column,
  ## the result is diagflat(s) - s.dot(s^T), mirroring
  ## the usual numpy formulation
  # The formula is expressed in terms of the softmax *output*, so the
  # input has to go through softmax first (just like sigmoidDerivative
  # computes the sigmoid of its input before differentiating)
  let probabilities = softmax(input).reshape(input.shape.cols, 1)
  # NOTE(review): unlike the other derivatives in this file the result is
  # built from an outer product rather than element by element, so it
  # presumably does not have the input's shape - confirm that callers
  # multiplying it element-wise handle that
  result = probabilities.diagflat() - probabilities.dot(probabilities.transpose())
|
|
|
|
proc relu(input: Matrix[float]): Matrix[float] =
  ## Vectorized rectified linear unit: applies
  ## max(0, x) to the elements of the input
  let rectify = proc (x: float): float = max(0.0, x)
  input.apply(rectify, axis = -1)
|
|
proc dxRelu(input: Matrix[float]): Matrix[float] =
  ## Derivative of ReLU: selects between a matrix
  ## of ones and 0 depending on input > 0.0
  let positive = input > 0.0
  let slopes = ones[float](input.shape)
  where(positive, slopes, 0)
|
|
|
|
proc silu(input: Matrix[float]): Matrix[float] =
  ## Vectorized sigmoid linear unit: applies
  ## x * sigmoid(x) to the elements of the input
  let swish = proc (x: float): float = x * sigmoid(x)
  input.apply(swish, axis = -1)
|
|
proc dSilu(input: Matrix[float]): Matrix[float] =
  ## Derivative of SiLU: applies
  ## s * (1 + x * (1 - s)), with s = sigmoid(x),
  ## to the elements of the input
  let slope = proc (x: float): float =
    let s = sigmoid(x)
    result = s * (1 + x * (1 - s))
  input.apply(slope, axis = -1)
|
|
|
|
proc htan(input: Matrix[float]): Matrix[float] =
  ## Vectorized hyperbolic tangent: applies
  ## tanh to the elements of the input
  # The standard library's tanh is used rather than the textbook
  # (e^2x - 1) / (e^2x + 1): for large x the exponential overflows to
  # infinity and the quotient turns into NaN, while tanh saturates at 1
  let f = proc (x: float): float = tanh(x)
  input.apply(f, axis = -1)
|
|
|
|
proc htanDx(input: Matrix[float]): Matrix[float] =
  ## Derivative of the hyperbolic tangent: applies
  ## 1 - tanh(x)^2 to the elements of the input
  let slope = proc (x: float): float =
    let t = tanh(x)
    result = 1 - (pow(t, 2))
  input.apply(slope, axis = -1)
|
|
|
|
# Ready-made activation and loss objects. Hints are switched off
# around them so nim doesn't complain about the naming
{.push hints: off.}
var
  Sigmoid* = newActivation(sigmoid, sigmoidDerivative)
  Softmax* = newActivation(softmax, softmaxDerivative)
  ReLU* = newActivation(relu, dxRelu)
  SiLU* = newActivation(silu, dSilu)
  HTan* = newActivation(htan, htanDx)
  MSE* = newLoss(mse, dxMSE)
{.pop.}
|
|
|
|
|