# Copyright 2023 Mattia Giambirtone & All Contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import ../util/matrix import std/strformat import std/random import std/math import std/sequtils randomize() type NeuralNetwork* = ref object ## A generic feed-forward ## neural network layers*: seq[Layer] loss: Loss # The network's cost function activation: Activation # The network's activation function # The network's learn rate determines # the amount of progress that is made # at each step when performing gradient # descent learnRate*: float # The momentum serves to speed up convergence # time when performing SGD: the higher the output # of the derivative of the cost function, the more # we nudge our inputs for our next epoch momentum*: float Loss* = ref object ## A vectorized loss function and its derivative function: proc (a, b: Matrix[float]): float derivative: proc (x, y: Matrix[float]): Matrix[float] Activation* = ref object ## A vectorized activation function and its ## derivative function: proc (input: Matrix[float]): Matrix[float] derivative: proc (x: Matrix[float]): Matrix[float] LayerKind* = enum ## A layer enumeration Dense, Dropout, Sparse Layer* = ref object ## A generic neural network ## layer kind*: LayerKind # TODO (add dropout and sparse layer!) inputSize*: int # The number of inputs we process outputSize*: int # The number of outputs we produce weights*: Matrix[float] # The weights for each connection (2D) biases*: Matrix[float] # The biases for each neuron (1D) proc `$`*(self: Layer): string = ## Returns a string representation ## of the layer result = &"Layer(inputs={self.inputSize}, outputs={self.outputSize})" proc `$`*(self: NeuralNetwork): string = ## Returns a string representation ## of the network result = &"NeuralNetwork(learnRate={self.learnRate}, layers={self.layers})" proc newLoss*(function: proc (a, b: Matrix[float]): float, derivative: proc (x, y: Matrix[float]): Matrix[float]): Loss = ## Creates a new Loss object new(result) result.function = function result.derivative = derivative proc newActivation*(function: proc (input: Matrix[float]): Matrix[float], derivative: proc (x: Matrix[float]): Matrix[float]): Activation = ## Creates a new Activation object new(result) result.function = function result.derivative = derivative proc newDenseLayer*(inputSize: int, outputSize: int): Layer = ## Creates a new dense layer with inputSize input ## parameters and outputSize outgoing outputs. new(result) result.inputSize = inputSize result.outputSize = outputSize result.kind = Dense proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation, learnRate: float, momentum: float, weightRange, biasRange: tuple[start, stop: float]): NeuralNetwork = ## Initializes a new neural network with ## the given topology and hyperparameters. ## Weights and biases are initialized with ## random values in the chosen range using ## nim's default PRNG new(result) result.layers = topology for layer in result.layers: var biases = newSeqOfCap[float](layer.outputSize) for _ in 0.. 1. I have some vague ideas as to why that may # make sense, but it's a wild guess really var nudge = self.learnRate / data.len().float if self.momentum > 0: # I _could_ go look at how other libraries implement # momentum, OR I could pull a formula out of my ass # and hope it works. Let's run with that, hm? nudge *= (1 / self.momentum) # The backpropagation algorithm lets us find the direction of steepest ascent # in the gradient of our cost function (which, remember, we're trying to minimize # by climbing it down), so we subtract that from the current weights and biases # to descend it the fastest (it's not actually *the* fastest because true gradient # descent would perform this over all training samples, but it's a pretty good # approximation nonetheless, it converges quickly and it actually helps prevent # overfitting by not letting the network train over the same data over and over # again) for (layer, newBiases) in zip(self.layers, biases): layer.biases = (layer.biases - nudge) * newBiases for (layer, newWeights) in zip(self.layers, weights): layer.weights = (layer.weights - nudge) * newWeights proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) = ## Train the network on the given data for the speficied ## number of epochs using the given batch size by applying ## stochastic gradient descent var batches: seq[seq[tuple[x, y: Matrix[float]]]] for epoch in 0.. 0.0, ones[float](input.shape), 0) proc silu(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = x * sigmoid(x), axis= -1) proc dSilu(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = sigmoid(x) * (1 + x * (1 - sigmoid(x))), axis = -1) proc htan(input: Matrix[float]): Matrix[float] = let f = proc (x: float): float = let temp = exp(2 * x) result = (temp - 1) / (temp + 1) input.apply(f, axis = -1) proc htanDx(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = 1 - (pow(tanh(x), 2)), axis = -1) {.push.} {.hints: off.} # So nim doesn't complain about the naming var Sigmoid* = newActivation(sigmoid, sigmoidDerivative) var Softmax* = newActivation(softmax, softmaxDerivative) var ReLU* = newActivation(relu, dxRelu) var SiLU* = newActivation(silu, dSilu) var HTan* = newActivation(htan, htanDx) var MSE* = newLoss(mse, dxMSE) {.pop.}