# Copyright 2023 Mattia Giambirtone & All Contributors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import ../util/matrix


import std/strformat
import std/random
import std/math
import std/sequtils


# Seed the default PRNG from std/random as soon as this module is loaded.
# The rand() calls in newNeuralNetwork and the shuffle() call in train
# both draw from this generator.
randomize()


type
    NeuralNetwork* = ref object
        ## A generic feed-forward
        ## neural network
        # The layers of the network, stored in the order
        # in which the input is fed through them
        layers*: seq[Layer]
        loss: Loss                            # The network's cost function (and its derivative)
        activation: Activation                # The single activation function shared by all layers
        # The network's learn rate determines
        # the size of the step that is taken
        # at each mini-batch when performing
        # gradient descent (miniBatch divides
        # it by the size of the batch)
        learnRate*: float
        # Step scaling factor used by miniBatch: when it
        # is greater than zero, the step size is multiplied
        # by 1 / momentum. NOTE(review): this is a home-grown
        # heuristic, not the classical velocity-based momentum
        # term, so values below 1 enlarge the step
        momentum*: float
    Loss* = ref object
        ## A vectorized loss function and its derivative
        # Maps two matrices (a, b) to a scalar cost
        function: proc (a, b: Matrix[float]): float
        # Maps two matrices (x, y) to a matrix; backprop
        # calls it with the network's activated output and
        # the expected output, in that order
        derivative: proc (x, y: Matrix[float]): Matrix[float]
    Activation* = ref object
        ## A vectorized activation function and its
        ## derivative
        # Applied to a layer's unactivated output
        function: proc (input: Matrix[float]): Matrix[float]
        # Called by backprop with a layer's unactivated output
        derivative: proc (x: Matrix[float]): Matrix[float]
    LayerKind* = enum
        ## The kinds of layer a network can contain
        # NOTE(review): only Dense has a constructor in this file
        Dense, Dropout, Sparse
    Layer* = ref object
        ## A generic neural network
        ## layer
        kind*: LayerKind                                     # TODO (add dropout and sparse layer!)
        inputSize*: int                                      # The number of inputs we process
        outputSize*: int                                     # The number of outputs we produce
        weights*: Matrix[float]                              # Connection weights, shaped (outputSize, inputSize) by newNeuralNetwork
        biases*: Matrix[float]                               # One bias per output neuron (1D), filled in by newNeuralNetwork


proc `$`*(self: Layer): string =
    ## Renders the layer as a human-readable
    ## string showing its input and output sizes
    "Layer(inputs=" & $self.inputSize & ", outputs=" & $self.outputSize & ")"


proc `$`*(self: NeuralNetwork): string =
    ## Renders the network as a human-readable
    ## string showing its learn rate and its layers
    "NeuralNetwork(learnRate=" & $self.learnRate & ", layers=" & $self.layers & ")"


proc newLoss*(function: proc (a, b: Matrix[float]): float, derivative: proc (x, y: Matrix[float]): Matrix[float]): Loss =
    ## Builds a Loss object out of a cost
    ## function and its derivative
    result = Loss(function: function, derivative: derivative)


proc newActivation*(function: proc (input: Matrix[float]): Matrix[float], derivative: proc (x: Matrix[float]): Matrix[float]): Activation =
    ## Builds an Activation object out of an
    ## activation function and its derivative
    result = Activation(function: function, derivative: derivative)


proc newDenseLayer*(inputSize: int, outputSize: int): Layer =
    ## Builds a dense layer that takes inputSize inputs
    ## and produces outputSize outputs. The weights and
    ## biases are left unset: newNeuralNetwork fills them in
    result = Layer(kind: Dense, inputSize: inputSize, outputSize: outputSize)


proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation, 
                       learnRate: float, momentum: float, weightRange, 
                       biasRange: tuple[start, stop: float]): NeuralNetwork =
    ## Builds a neural network out of the given layers and
    ## hyperparameters. Every layer in the topology gets its
    ## biases and weights overwritten with values drawn from
    ## nim's default PRNG, uniformly within biasRange and
    ## weightRange respectively
    result = NeuralNetwork(layers: topology, loss: lossFunc, activation: activationFunc,
                           learnRate: learnRate, momentum: momentum)
    for layer in topology:
        # Biases are drawn before the weights, one per output neuron
        var biasValues = newSeqOfCap[float](layer.outputSize)
        for neuron in 1..layer.outputSize:
            biasValues.add(rand(biasRange.start..biasRange.stop))
        # One weight per (output, input) pair, generated row by row
        var weightValues = newSeqOfCap[float](layer.inputSize * layer.outputSize)
        for row in 1..layer.outputSize:
            for column in 1..layer.inputSize:
                weightValues.add(rand(weightRange.start..weightRange.stop))
        layer.biases = newMatrix[float](biasValues)
        # The weight matrix is stored as (outputSize, inputSize) rather than
        # the other way around, so that it can be used in the dot product
        # with the input directly, without transposing it first
        layer.weights = newMatrixFromSeq[float](weightValues, (layer.outputSize, layer.inputSize))


proc feed(self: Layer, x: Matrix[float]): Matrix[float] =
    ## Computes the layer's (unactivated) output for the
    ## input x: the weights dotted with x, plus the biases
    let weighted = self.weights.dot(x)
    result = weighted + self.biases


proc fastFeedForward(self: NeuralNetwork, x: Matrix[float]): Matrix[float] {.used.} =
    ## Feeds the given input through the network. The
    ## (unactivated) output from the last layer is returned.
    ##
    ## The raw input x is fed to the first layer as is; every
    ## following layer is fed the *activated* output of the
    ## layer before it. Without the activation in between, the
    ## whole stack of layers would collapse into a single affine
    ## transformation, no matter how many layers there are
    result = x
    for i, layer in self.layers:
        if i > 0:
            # Activate the previous layer's output before passing it on
            result = self.activation.function(result)
        result = layer.feed(result)


proc feedForward(self: NeuralNetwork, x: Matrix[float]): seq[Matrix[float]] =
    ## Feeds the given input through the network.
    ## Returns the input x followed by the unactivated
    ## output of each layer, in order.
    ##
    ## The raw input x is fed to the first layer as is; every
    ## following layer is fed the *activated* output of the
    ## layer before it (only the unactivated values are stored
    ## in the result). Without the activation in between, the
    ## layers would collapse into a single affine transformation
    result.add(x)
    for i, layer in self.layers:
        let layerInput =
            if i == 0: x
            else: self.activation.function(result[^1])
        result.add(layer.feed(layerInput))

# TODO: Consider optimizing this to take n m-length vectors in an m*n matrix instead
# of calling it with every sample in every mini-batch, so as to offload the heavy
# lifting to the matrix library (this hasn't been done yet, both for simplicity and
# because the matrix library we use is horribly inefficient anyway)
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
    ## Performs a single backpropagation step with the given
    ## training sample (input x, expected output y) and returns
    ## the gradient of the network's cost function w.r.t. the
    ## weights and biases, i.e. the direction of steepest ascent.
    ## Both returned sequences hold one matrix per layer, in the
    ## same order as self.layers. An empty network yields two
    ## empty sequences
    var
        # The deltas for the weights and biases of
        # each layer in the network
        deltaW: seq[Matrix[float]] = @[]
        deltaB: seq[Matrix[float]] = @[]
        # Activations of each layer. The first entry is the raw
        # input, which acts as the "activation" fed to the first
        # layer, so this has one more element than unactivated
        activations: seq[Matrix[float]] = @[x]
        # Unactivated outputs of each layer
        unactivated: seq[Matrix[float]] = @[]

    if self.layers.len() == 0:
        return (deltaW, deltaB)

    # Initialize all of our deltas to zero
    for layer in self.layers:
        deltaW.add(zeros[float](layer.weights.shape))
        deltaB.add(zeros[float](layer.biases.shape))

    # Forward pass through the network: each layer is fed the
    # activated output of the previous one, and we remember both
    # the unactivated and the activated value at every step since
    # the backwards pass needs both
    for layer in self.layers:
        unactivated.add(layer.feed(activations[^1]))
        activations.add(self.activation.function(unactivated[^1]))

    # Error of the output layer: by the chain rule it is the derivative of
    # the cost w.r.t. the network's output times the derivative of the
    # activation function at the output layer's unactivated value
    var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.activation.derivative(unactivated[^1])
    deltaB[^1].replace(diff)
    # The gradient w.r.t. the weights is the layer's error combined with
    # the activations that were fed *into* that layer
    deltaW[^1].replace(diff.dot(activations[^2].transpose()))
    # Backwards pass: l counts layers from the end (l = 1 is the output layer,
    # handled above). The error of layer ^l is obtained by pulling the error
    # of the layer after it (^(l - 1)) back through that layer's weights
    for l in 2..self.layers.len():
        diff = self.layers[^(l - 1)].weights.transpose().dot(diff) * self.activation.derivative(unactivated[^l])
        deltaB[^l].replace(diff)
        # activations is one element longer than unactivated, so the
        # input of layer ^l lives at activations[^(l + 1)]
        deltaW[^l].replace(diff.dot(activations[^(l + 1)].transpose()))
    return (deltaW, deltaB)


proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
    ## Performs a single mini-batch step in stochastic gradient
    ## descent and updates the network's weights and biases
    ## accordingly. data holds the (input, expected output) pairs
    ## that make up the batch; an empty batch is a no-op
    if data.len() == 0:
        # Nothing to learn from, and dividing the learn rate
        # by zero below would poison every weight and bias
        return
    # Sum of the gradients of every sample in the batch
    var 
        weights: seq[Matrix[float]] = @[]
        biases: seq[Matrix[float]] = @[]
    for layer in self.layers:
        weights.add(zeros[float](layer.weights.shape))
        biases.add(zeros[float](layer.biases.shape))
    for dataPoint in data:
        let gradient = self.backprop(dataPoint.x, dataPoint.y)
        # backprop returns one matrix per layer, in layer order
        for i in 0..<self.layers.len():
            biases[i] = biases[i] + gradient.biases[i]
            weights[i] = weights[i] + gradient.weights[i]

    # Dividing the learn rate by the size of the batch turns the
    # summed gradients above into their average, so the size of the
    # step does not depend on how many samples are in the batch
    var nudge = self.learnRate / data.len().float
    if self.momentum > 0:
        # NOTE(review): this is a home-grown heuristic rather than the
        # classical momentum term (which keeps a running velocity for
        # each parameter): it merely scales the step by 1 / momentum
        nudge *= (1 / self.momentum)
    # The backpropagation algorithm gives us the direction of steepest ascent
    # of our cost function (which, remember, we're trying to minimize), so we
    # move each parameter by a small step in the opposite direction:
    #   parameter = parameter - nudge * gradient
    # This is only an approximation of true gradient descent, which would use
    # every training sample for each step, but it's a pretty good one and it
    # converges quickly
    for i, layer in self.layers:
        layer.biases = layer.biases - nudge * biases[i]
        layer.weights = layer.weights - nudge * weights[i]


proc train*(self: NeuralNetwork, epochs: int, batchSize: int, data: var seq[tuple[x, y: Matrix[float]]]) =
    ## Train the network on the given data for the specified
    ## number of epochs using the given batch size by applying
    ## stochastic gradient descent.
    ##
    ## data is shuffled in place at the start of every epoch and
    ## then split into consecutive batches of batchSize samples
    ## (the last batch is shorter when the data does not divide
    ## evenly). Raises a ValueError if batchSize is less than 1
    if batchSize < 1:
        # A step of zero (or less) would never reach the end of the data
        raise newException(ValueError, &"batchSize must be at least 1 (got {batchSize})")
    for epoch in 0..<epochs:
        # We shuffle the data so that different epochs work
        # on different batches. This will hopefully help the
        # network generalize its training onto unseen data
        shuffle(data)
        var start = 0
        while start < data.len():
            # Clamp the end of the batch so the last one
            # doesn't run past the end of the data
            let stop = min(start + batchSize, data.len())
            self.miniBatch(data[start..<stop])
            start = stop
        echo &"Completed training epoch {epoch + 1}"

## Utility functions

proc mse(a, b: Matrix[float]): float = 
    ## Mean squared error: the sum of the squared element-wise
    ## differences between b and a, divided by len(a)
    let square = proc (x: float): float = pow(x, 2)
    let squaredErrors = (b - a).apply(square, axis = -1)
    result = squaredErrors.sum() / len(a).float

func dxMSE(x, y: Matrix[float]): Matrix[float] =
    ## Derivative of the mean squared error: twice the
    ## element-wise difference between x and y
    let error = x - y
    result = 2.0 * error


func sigmoid(x: float): float =
    ## The logistic function applied to a single
    ## value: maps any real number into (0, 1)
    let denominator = 1.0 + exp(-x)
    result = 1.0 / denominator

# A bunch of vectorized activation functions

proc sigmoid(input: Matrix[float]): Matrix[float] = 
    ## Vectorized sigmoid: applies the scalar
    ## sigmoid to every element of the input
    input.apply(sigmoid, axis = -1)

proc sigmoidDerivative(input: Matrix[float]): Matrix[float] =
    ## Derivative of the sigmoid at the given (unactivated)
    ## input, computed as s * (1 - s) where s = sigmoid(input)
    let activated = sigmoid(input)
    result = activated * (1.0 - activated)


proc softmax(input: Matrix[float]): Matrix[float] = 
    ## Numerically stable softmax: the input's maximum is
    ## subtracted from it before exponentiating, so that large
    ## inputs don't overflow the exponentials. The result is
    ## each exponential divided by the sum of all of them
    let shifted = input - input.max()
    let exponentials = shifted.apply(math.exp, axis = -1)
    result = exponentials / exponentials.sum()

proc softmaxDerivative(input: Matrix[float]): Matrix[float] =
    ## Derivative (Jacobian) of the softmax at the given
    ## unactivated input: diagflat(s) - s . s^T, where s is
    ## the softmax of the input laid out as a column.
    ##
    ## Like the other derivatives in this module, this takes
    ## the unactivated value, so the softmax has to be applied
    ## here first: the formula is only valid for softmax outputs,
    ## not for the raw input
    let probabilities = softmax(input).reshape(input.shape.cols, 1)
    # Same construction numpy users know as
    # np.diagflat(s) - np.dot(s, s.T)
    result = probabilities.diagflat() - probabilities.dot(probabilities.transpose())

proc relu(input: Matrix[float]): Matrix[float] =
    ## Rectified linear unit: replaces every element
    ## of the input with the larger of itself and zero
    let rectify = proc (x: float): float = max(0.0, x)
    input.apply(rectify, axis = -1)
proc dxRelu(input: Matrix[float]): Matrix[float] =
    ## Derivative of the ReLU: selects from a matrix of ones
    ## wherever the input is greater than zero, and 0 elsewhere
    let positive = input > 0.0
    let allOnes = ones[float](input.shape)
    result = where(positive, allOnes, 0)

proc silu(input: Matrix[float]): Matrix[float] =
    ## Sigmoid linear unit: every element x of
    ## the input is replaced with x * sigmoid(x)
    let swish = proc (x: float): float = x * sigmoid(x)
    input.apply(swish, axis = -1)
proc dSilu(input: Matrix[float]): Matrix[float] =
    ## Derivative of the SiLU, applied element-wise:
    ## s * (1 + x * (1 - s)) where s = sigmoid(x)
    let slope = proc (x: float): float =
        let s = sigmoid(x)
        result = s * (1 + x * (1 - s))
    input.apply(slope, axis = -1)

proc htan(input: Matrix[float]): Matrix[float] = 
    ## Hyperbolic tangent, applied to every element of the input.
    ##
    ## This relies on tanh from std/math rather than on the textbook
    ## formula (e^2x - 1) / (e^2x + 1): for large values of x the
    ## exponential overflows to infinity and the formula then yields
    ## inf / inf = NaN instead of 1
    let f = proc (x: float): float = tanh(x)
    input.apply(f, axis = -1)

proc htanDx(input: Matrix[float]): Matrix[float] =
    ## Derivative of the hyperbolic tangent, applied
    ## element-wise: 1 - tanh(x)^2
    let slope = proc (x: float): float =
        let t = tanh(x)
        result = 1 - pow(t, 2)
    input.apply(slope, axis = -1)

{.push.}
{.hints: off.}   # So nim doesn't complain about the naming
# Ready-made activation and loss objects, each pairing
# a function from this module with its derivative
var
    Sigmoid* = newActivation(sigmoid, sigmoidDerivative)
    Softmax* = newActivation(softmax, softmaxDerivative)
    ReLU* = newActivation(relu, dxRelu)
    SiLU* = newActivation(silu, dSilu)
    HTan* = newActivation(htan, htanDx)
    MSE* = newLoss(mse, dxMSE)
{.pop.}