# Copyright 2023 Mattia Giambirtone & All Contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import ../util/matrix import std/strformat import std/random import std/math import std/sequtils randomize() type NeuralNetwork* = ref object ## A generic feed-forward ## neural network layers*: seq[Layer] loss: Loss # The network's cost function activation: Activation # The network's activation function # The network's learn rate determines # the amount of progress that is made # at each step when performing gradient # descent learnRate*: float # The momentum serves to speed up convergence # time when performing SGD: the higher the output # of the derivative of the cost function, the more # we nudge our inputs for our next epoch momentum*: float Loss* = ref object ## A vectorized loss function and its derivative function: proc (a, b: Matrix[float]): float derivative: proc (x, y: Matrix[float]): Matrix[float] Activation* = ref object ## A vectorized activation function and its ## derivative function: proc (input: Matrix[float]): Matrix[float] derivative: proc (x: Matrix[float]): Matrix[float] LayerKind* = enum ## A layer enumeration Dense, Dropout, Sparse Layer* = ref object ## A generic neural network ## layer kind*: LayerKind # TODO (add dropout and sparse layer!) inputSize*: int # The number of inputs we process outputSize*: int # The number of outputs we produce weights*: Matrix[float] # The weights for each connection (2D) biases*: Matrix[float] # The biases for each neuron (1D) proc `$`*(self: Layer): string = ## Returns a string representation ## of the layer result = &"Layer(inputs={self.inputSize}, outputs={self.outputSize})" proc `$`*(self: NeuralNetwork): string = ## Returns a string representation ## of the network result = &"NeuralNetwork(learnRate={self.learnRate}, layers={self.layers})" proc newLoss*(function: proc (a, b: Matrix[float]): float, derivative: proc (x, y: Matrix[float]): Matrix[float]): Loss = ## Creates a new Loss object new(result) result.function = function result.derivative = derivative proc newActivation*(function: proc (input: Matrix[float]): Matrix[float], derivative: proc (x: Matrix[float]): Matrix[float]): Activation = ## Creates a new Activation object new(result) result.function = function result.derivative = derivative proc newDenseLayer*(inputSize: int, outputSize: int): Layer = ## Creates a new dense layer with inputSize input ## parameters and outputSize outgoing outputs. new(result) result.inputSize = inputSize result.outputSize = outputSize result.kind = Dense proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation, learnRate: float, momentum: float, weightRange, biasRange: tuple[start, stop: float]): NeuralNetwork = ## Initializes a new neural network with ## the given topology and hyperparameters. ## Weights and biases are initialized with ## random values in the chosen range using ## nim's default PRNG when not defined(release): if momentum > 1.0: raise newException(ValueError, "momentum should not be greater than one") new(result) result.layers = topology for layer in result.layers: var biases = newSeqOfCap[float](layer.outputSize) for _ in 0.. 1. I have some vague ideas as to why that may # make sense, but it's a wild guess really var nudge = self.learnRate / data.len().float if self.momentum > 0: # I _could_ go look at how other libraries implement # momentum, OR I could pull a formula out of my ass # and hope it works. Let's run with that, hm? nudge *= (1 / self.momentum) # The backpropagation algorithm lets us find the direction of steepest ascent # in the gradient of our cost function (which, remember, we're trying to minimize # by climbing it down), so we subtract that from the current weights and biases # to descend it the fastest (it's not actually *the* fastest because true gradient # descent would perform this over all training samples, but it's a pretty good # approximation nonetheless, it converges quickly and it actually helps prevent # overfitting by not letting the network train over the same data over and over # again) for (layer, newBiases) in zip(self.layers, biases): layer.biases = layer.biases - (newBiases * nudge) for (layer, newWeights) in zip(self.layers, weights): layer.weights = layer.weights - (newWeights * nudge) proc eval(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]): float = for sample in data: result = result + self.loss.function(self.fastFeedForward(sample.x), sample.y) result = result / len(data).float proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]], test: seq[tuple[x, y: Matrix[float]]] = @[]) = ## Train the network on the given data for the speficied ## number of epochs using the given batch size by applying ## stochastic gradient descent. If some test data is provided, ## training progress is printed out after each epoch var batches: seq[seq[tuple[x, y: Matrix[float]]]] for epoch in 0.. 0: echo &"Cost: {self.eval(test)}" ## Utility functions # Mean squared error proc mse(a, b: Matrix[float]): float = result = (b - a).apply(proc (x: float): float = pow(x, 2), axis = -1).sum() / len(a).float # Derivative of MSE func dxMSE(x, y: Matrix[float]): Matrix[float] = 2.0 * (x - y) func sigmoid(x: float): float = 1 / (1 + exp(-x)) # A bunch of vectorized activation functions proc sigmoid(input: Matrix[float]): Matrix[float] = result = input.apply(sigmoid, axis = -1) proc sigmoidDerivative(input: Matrix[float]): Matrix[float] = sigmoid(input) * (1.0 - sigmoid(input)) proc softmax(input: Matrix[float]): Matrix[float] = # This is the good kind of softmax (stole it from # stackoverflow lol) which means it doesn't violently # detonate if the input gets too large because # of the exponentials. I love the internet! var input = input - input.max() result = input.apply(math.exp, axis = -1) / input.apply(math.exp, axis = -1).sum() proc softmaxDerivative(input: Matrix[float]): Matrix[float] = # I stole this too, by the way var input = input.reshape(input.shape.cols, 1) # I _love_ stealing functions from numpy! result = (input.diagflat() - input.dot(input.transpose())).flatten( ) proc relu(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = max(0.0, x), axis = -1) proc dxRelu(input: Matrix[float]): Matrix[float] = where(input > 0.0, ones[float](input.shape), 0) proc silu(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = x * sigmoid(x), axis= -1) proc dSilu(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = sigmoid(x) * (1 + x * (1 - sigmoid(x))), axis = -1) proc htan(input: Matrix[float]): Matrix[float] = let f = proc (x: float): float = let temp = exp(2 * x) result = (temp - 1) / (temp + 1) input.apply(f, axis = -1) proc htanDx(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = 1 - (pow(tanh(x), 2)), axis = -1) {.push.} {.hints: off.} # So nim doesn't complain about the naming var Sigmoid* = newActivation(sigmoid, sigmoidDerivative) var Softmax* = newActivation(softmax, softmaxDerivative) var ReLU* = newActivation(relu, dxRelu) var SiLU* = newActivation(silu, dSilu) var HTan* = newActivation(htan, htanDx) var MSE* = newLoss(mse, dxMSE) {.pop.}