Simplified the design, minor additions
This commit is contained in:
parent
a97cec41a6
commit
9baede9b54
|
@ -1,2 +1,11 @@
|
||||||
# NNExperiments
|
# NNExperiments
|
||||||
|
|
||||||
|
AI stuff: feed-forward neural network experiments written in Nim.
|
||||||
|
|
||||||
|
## TODOs
|
||||||
|
|
||||||
|
- Regularization (L1/L2)
|
||||||
|
- Implement momentum
|
||||||
|
- Optimize matrix multiplication
|
||||||
|
- ???
|
||||||
|
- Profit
|
||||||
|
|
10
src/main.nim
10
src/main.nim
|
@ -2,9 +2,11 @@ import nn/network
|
||||||
import nn/util/matrix
|
import nn/util/matrix
|
||||||
|
|
||||||
|
|
||||||
var mlp = newNeuralNetwork(@[newDenseLayer(784, 10, Sigmoid),
|
var mlp = newNeuralNetwork(@[newDenseLayer(784, 10),
|
||||||
newDenseLayer(10, 16, Sigmoid),
|
newDenseLayer(10, 16),
|
||||||
newDenseLayer(16, 10, Softmax)],
|
newDenseLayer(16, 10)],
|
||||||
lossFunc=MSE, learnRate=0.05, momentum=0.55,
|
lossFunc=MSE, activationFunc=Softmax,
|
||||||
|
learnRate=0.05, momentum=0.55,
|
||||||
weightRange=(start: -1.0, stop: 1.0),
|
weightRange=(start: -1.0, stop: 1.0),
|
||||||
biasRange=(start: -1.0, stop: 1.0))
|
biasRange=(start: -1.0, stop: 1.0))
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,8 @@ type
|
||||||
## A generic feed-forward
|
## A generic feed-forward
|
||||||
## neural network
|
## neural network
|
||||||
layers*: seq[Layer]
|
layers*: seq[Layer]
|
||||||
loss: Loss # The cost function along with its derivative
|
loss: Loss # The network's cost function
|
||||||
|
activation: Activation # The network's activation function
|
||||||
# The network's learn rate determines
|
# The network's learn rate determines
|
||||||
# the amount of progress that is made
|
# the amount of progress that is made
|
||||||
# at each step when performing gradient
|
# at each step when performing gradient
|
||||||
|
@ -41,23 +42,25 @@ type
|
||||||
# we nudge our inputs for our next epoch
|
# we nudge our inputs for our next epoch
|
||||||
momentum*: float
|
momentum*: float
|
||||||
Loss* = ref object
|
Loss* = ref object
|
||||||
## A loss function and its derivative
|
## A vectorized loss function and its derivative
|
||||||
function: proc (a, b: Matrix[float]): float
|
function: proc (a, b: Matrix[float]): float
|
||||||
derivative: proc (x, y: Matrix[float]): Matrix[float] {.noSideEffect.}
|
derivative: proc (x, y: Matrix[float]): Matrix[float] {.noSideEffect.}
|
||||||
Activation* = ref object
|
Activation* = ref object
|
||||||
## An activation function
|
## A vectorized activation function and its
|
||||||
|
## derivative
|
||||||
function: proc (input: Matrix[float]): Matrix[float] {.noSideEffect.}
|
function: proc (input: Matrix[float]): Matrix[float] {.noSideEffect.}
|
||||||
derivative: proc (x: Matrix[float]): Matrix[float] {.noSideEffect.}
|
derivative: proc (x: Matrix[float]): Matrix[float] {.noSideEffect.}
|
||||||
|
LayerKind* = enum
|
||||||
|
## A layer enumeration
|
||||||
|
Dense, Dropout, Sparse
|
||||||
Layer* = ref object
|
Layer* = ref object
|
||||||
## A generic neural network
|
## A generic neural network
|
||||||
## layer
|
## layer
|
||||||
|
kind*: LayerKind # TODO (add dropout and sparse layer!)
|
||||||
inputSize*: int # The number of inputs we process
|
inputSize*: int # The number of inputs we process
|
||||||
outputSize*: int # The number of outputs we produce
|
outputSize*: int # The number of outputs we produce
|
||||||
weights*: Matrix[float] # The weights for each connection (2D)
|
weights*: Matrix[float] # The weights for each connection (2D)
|
||||||
biases*: Matrix[float] # The biases for each neuron (1D)
|
biases*: Matrix[float] # The biases for each neuron (1D)
|
||||||
gradients: tuple[weights, biases: Matrix[float]] # Gradient coefficients for weights and biases
|
|
||||||
activation: Activation # The layer's activation function
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
proc `$`*(self: Layer): string =
|
proc `$`*(self: Layer): string =
|
||||||
|
@ -86,82 +89,109 @@ proc newActivation*(function: proc (input: Matrix[float]): Matrix[float] {.noSid
|
||||||
result.derivative = derivative
|
result.derivative = derivative
|
||||||
|
|
||||||
|
|
||||||
proc newDenseLayer*(inputSize: int, outputSize: int, activationFunc: Activation): Layer =
|
proc newDenseLayer*(inputSize: int, outputSize: int): Layer =
|
||||||
## Creates a new dense layer with inputSize input
|
## Creates a new dense layer with inputSize input
|
||||||
## parameters and outputSize outgoing outputs and
|
## parameters and outputSize outgoing outputs.
|
||||||
## using the chosen activation function.
|
|
||||||
new(result)
|
new(result)
|
||||||
result.inputSize = inputSize
|
result.inputSize = inputSize
|
||||||
result.outputSize = outputSize
|
result.outputSize = outputSize
|
||||||
result.activation = activationFunc
|
result.kind = Dense
|
||||||
|
|
||||||
|
|
||||||
proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, learnRate: float, momentum: float,
|
proc newNeuralNetwork*(topology: seq[Layer], lossFunc: Loss, activationFunc: Activation,
|
||||||
weightRange, biasRange: tuple[start, stop: float]): NeuralNetwork =
|
learnRate: float, momentum: float, weightRange,
|
||||||
|
biasRange: tuple[start, stop: float]): NeuralNetwork =
|
||||||
## Initializes a new neural network with
|
## Initializes a new neural network with
|
||||||
## the given topology and hyperparameters.
|
## the given topology and hyperparameters.
|
||||||
## Weights and biases are initialized with
|
## Weights and biases are initialized with
|
||||||
## random values in the chosen range
|
## random values in the chosen range using
|
||||||
|
## Nim's default PRNG
|
||||||
new(result)
|
new(result)
|
||||||
result.layers = topology
|
result.layers = topology
|
||||||
for layer in result.layers:
|
for layer in result.layers:
|
||||||
var biases = newSeqOfCap[float](layer.outputSize)
|
var biases = newSeqOfCap[float](layer.outputSize)
|
||||||
var biasGradients = newSeqOfCap[float](layer.outputSize)
|
|
||||||
for _ in 0..<layer.outputSize:
|
for _ in 0..<layer.outputSize:
|
||||||
biases.add(rand(biasRange.start..biasRange.stop))
|
biases.add(rand(biasRange.start..biasRange.stop))
|
||||||
biasGradients.add(0.0)
|
|
||||||
var weights = newSeqOfCap[float](layer.inputSize * layer.outputSize)
|
var weights = newSeqOfCap[float](layer.inputSize * layer.outputSize)
|
||||||
var weightGradients = newSeqOfCap[float](layer.inputSize * layer.outputSize)
|
|
||||||
for _ in 0..<layer.outputSize:
|
for _ in 0..<layer.outputSize:
|
||||||
for _ in 0..<layer.inputSize:
|
for _ in 0..<layer.inputSize:
|
||||||
weights.add(rand(weightRange.start..weightRange.stop))
|
weights.add(rand(weightRange.start..weightRange.stop))
|
||||||
weightGradients.add(0.0)
|
|
||||||
layer.biases = newMatrix[float](biases)
|
layer.biases = newMatrix[float](biases)
|
||||||
# Why swap outputSize and inputSize in the matrix shape? The reason is simple: this
|
# Why swap outputSize and inputSize in the matrix shape? The reason is simple: this
|
||||||
# spares us from having to transpose it later when we perform the dot product (I get
|
# spares us from having to transpose it later when we perform the dot product (I get
|
||||||
# that it's a constant time operation, but if we can avoid it altogether, that's even
|
# that it's a constant time operation, but if we can avoid it altogether, that's even
|
||||||
# better!)
|
# better!)
|
||||||
layer.weights = newMatrixFromSeq[float](weights, (layer.outputSize, layer.inputSize))
|
layer.weights = newMatrixFromSeq[float](weights, (layer.outputSize, layer.inputSize))
|
||||||
layer.gradients = (weights: newMatrix[float](weightGradients),
|
|
||||||
biases: newMatrixFromSeq[float](biasGradients, (layer.outputSize, layer.inputSize)))
|
|
||||||
result.loss = lossFunc
|
result.loss = lossFunc
|
||||||
|
result.activation = activationFunc
|
||||||
result.learnRate = learnRate
|
result.learnRate = learnRate
|
||||||
result.momentum = momentum
|
result.momentum = momentum
|
||||||
|
|
||||||
|
|
||||||
|
proc feed(self: Layer, x: Matrix[float]): Matrix[float] =
|
||||||
|
## Feeds the given input to the layer.
|
||||||
|
## The layer's output is returned
|
||||||
|
result = self.weights.dot(x) + self.biases
|
||||||
|
|
||||||
|
|
||||||
|
proc fastFeedForward(self: NeuralNetwork, x: Matrix[float]): Matrix[float] {.used.} =
|
||||||
|
## Feeds the given input through the network. The
|
||||||
|
## (unactivated) output from the last layer is returned
|
||||||
|
result = x
|
||||||
|
for layer in self.layers:
|
||||||
|
result = layer.feed(result)
|
||||||
|
|
||||||
|
|
||||||
|
proc feedForward(self: NeuralNetwork, x: Matrix[float]): seq[Matrix[float]] =
|
||||||
|
## Feeds the given input through the network.
|
||||||
|
## All unactivated outputs from each layer are
|
||||||
|
## returned in order
|
||||||
|
result.add(x)
|
||||||
|
for layer in self.layers:
|
||||||
|
result.add(layer.feed(result[^1]))
|
||||||
|
|
||||||
|
# TODO: Consider optimizing this to take n m-length vectors in an m*n matrix instead
|
||||||
|
# of calling it with every sample in every mini-batch to offload the heavy lifting
|
||||||
|
# to the matrix library (this hasn't been done yet, both for simplicity purposes and
|
||||||
|
# also because the matrix library we use is horribly inefficient anyway)
|
||||||
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
|
proc backprop(self: NeuralNetwork, x, y: Matrix[float]): tuple[weights, biases: seq[Matrix[float]]] =
|
||||||
## Performs a single backpropagation step and returns the
|
## Performs a single backpropagation step with the given
|
||||||
## gradient of the cost function for the weights and biases
|
## training sample and returns the direction of steepest
|
||||||
## of the network according to the given training sample
|
## ascent (i.e. the gradient) of the network's cost function
|
||||||
|
## w.r.t the weights and biases
|
||||||
var
|
var
|
||||||
# The deltas for the weights and biases of
|
# The deltas for the weights and biases of
|
||||||
# each layer in the network
|
# each layer in the network
|
||||||
deltaW: seq[Matrix[float]] = @[]
|
deltaW: seq[Matrix[float]] = @[]
|
||||||
deltaB: seq[Matrix[float]] = @[]
|
deltaB: seq[Matrix[float]] = @[]
|
||||||
# Activations of each layer
|
# Activations of each layer
|
||||||
activation = x
|
activations: seq[Matrix[float]] = @[]
|
||||||
activations: seq[Matrix[float]] = @[x]
|
|
||||||
# Unactivated outputs of each layer
|
# Unactivated outputs of each layer
|
||||||
unactivated: seq[Matrix[float]] = @[]
|
unactivated: seq[Matrix[float]] = @[]
|
||||||
# Forward pass through the network
|
|
||||||
|
# Initialize all of our deltas to zero
|
||||||
for layer in self.layers:
|
for layer in self.layers:
|
||||||
deltaW.add(zeros[float](layer.weights.shape))
|
deltaW.add(zeros[float](layer.weights.shape))
|
||||||
deltaB.add(zeros[float](layer.biases.shape))
|
deltaB.add(zeros[float](layer.biases.shape))
|
||||||
unactivated.add(layer.weights.dot(activation) + layer.biases)
|
|
||||||
activations.add(layer.activation.function(unactivated[^1]))
|
|
||||||
# Backwards pass
|
|
||||||
|
|
||||||
# The negative gradient of each layer for this sample: this is a
|
# Forward pass through the network
|
||||||
# partial derivative, so the multiplication here is just an
|
unactivated = self.feedForward(x)
|
||||||
# application of the chain rule!
|
# TODO: This can probably be optimized
|
||||||
var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.layers[^1].activation.derivative(unactivated[^1])
|
for unact in unactivated:
|
||||||
|
activations.add(self.activation.function(unact))
|
||||||
|
|
||||||
|
# This stores the gradient of each layer for this sample: since it is a
|
||||||
|
# partial derivative the multiplication here is just an application of the
|
||||||
|
# chain rule (Because while the cost function does indeed depend on the
|
||||||
|
# weights and biases, they aren't explicit arguments to it, which means we
|
||||||
|
# have to do fancy calculus stuff to figure out the derivative)
|
||||||
|
var diff: Matrix[float] = self.loss.derivative(activations[^1], y) * self.activation.derivative(unactivated[^1])
|
||||||
deltaB[^1].replace(diff)
|
deltaB[^1].replace(diff)
|
||||||
deltaW[^1].replace(activations[^2].transpose())
|
deltaW[^1].replace(activations[^2].transpose())
|
||||||
|
# Backwards pass (actually the backwards pass began two lines earlier, we're just feeding
|
||||||
|
# the correction back through the rest of the network now)
|
||||||
for l in 2..self.layers.high():
|
for l in 2..self.layers.high():
|
||||||
# The ^ makes our indices start from the back instead of
|
diff = self.layers[^l].weights.transpose.dot(diff) * self.activation.derivative(unactivated[^l])
|
||||||
# from the front, so we're really iterating over our layers
|
|
||||||
# backwards!
|
|
||||||
diff = self.layers[^l].weights.transpose.dot(diff) * self.layers[^l].activation.derivative(unactivated[^l])
|
|
||||||
deltaB[^l].replace(diff)
|
deltaB[^l].replace(diff)
|
||||||
deltaW[^l].replace(diff.dot(activations[^(l - 1)].transpose()))
|
deltaW[^l].replace(diff.dot(activations[^(l - 1)].transpose()))
|
||||||
return (deltaW, deltaB)
|
return (deltaW, deltaB)
|
||||||
|
@ -185,8 +215,9 @@ proc miniBatch(self: NeuralNetwork, data: seq[tuple[x, y: Matrix[float]]]) =
|
||||||
biases[i] = currentBiases + newBiases
|
biases[i] = currentBiases + newBiases
|
||||||
for i, (currentWeights, newWeights) in zip(weights, gradient.weights):
|
for i, (currentWeights, newWeights) in zip(weights, gradient.weights):
|
||||||
weights[i] = currentWeights + newWeights
|
weights[i] = currentWeights + newWeights
|
||||||
# The backpropagation algorithm lets us find the gradient of steepest ascent
|
# The backpropagation algorithm lets us find the direction of steepest ascent
|
||||||
# in our cost function, so we subtract it from the current weights and biases
|
# (the gradient) of our cost function (which, remember, we're trying to minimize
|
||||||
|
# by climbing it down), so we subtract that from the current weights and biases
|
||||||
# to descend it the fastest (it's not actually *the* fastest because true gradient
|
# to descend it the fastest (it's not actually *the* fastest because true gradient
|
||||||
# descent would perform this over all training samples, but it's a pretty good
|
# descent would perform this over all training samples, but it's a pretty good
|
||||||
# approximation nonetheless, it converges quickly and it actually helps prevent
|
# approximation nonetheless, it converges quickly and it actually helps prevent
|
||||||
|
@ -237,18 +268,26 @@ func sigmoidDerivative(input: Matrix[float]): Matrix[float] = sigmoid(input) * (
|
||||||
|
|
||||||
|
|
||||||
func softmax(input: Matrix[float]): Matrix[float] =
|
func softmax(input: Matrix[float]): Matrix[float] =
|
||||||
|
# This is the good kind of softmax (stole it from
|
||||||
|
# stackoverflow lol) which means it doesn't violently
|
||||||
|
# detonate if the input gets too large because
|
||||||
|
# of the exponentials. I love the internet!
|
||||||
var input = input - input.max()
|
var input = input - input.max()
|
||||||
result = input.apply(math.exp, axis = -1) / input.apply(math.exp, axis = -1).sum()
|
result = input.apply(math.exp, axis = -1) / input.apply(math.exp, axis = -1).sum()
|
||||||
|
|
||||||
func softmaxDerivative(input: Matrix[float]): Matrix[float] =
|
func softmaxDerivative(input: Matrix[float]): Matrix[float] =
|
||||||
|
# I stole this too, by the way
|
||||||
var input = input.reshape(input.shape.cols, 1)
|
var input = input.reshape(input.shape.cols, 1)
|
||||||
|
# I _love_ stealing functions from numpy!
|
||||||
result = input.diagflat() - input.dot(input.transpose())
|
result = input.diagflat() - input.dot(input.transpose())
|
||||||
|
|
||||||
|
func relu(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = max(0.0, x), axis = -1)
|
||||||
|
func dxRelu(input: Matrix[float]): Matrix[float] = input.where(input > 0.0, 0.0)
|
||||||
|
|
||||||
# TODO: Add derivatives for this stuff
|
# TODO: Add derivatives for this stuff
|
||||||
func step(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = (if x < 0.0: 0.0 else: x), axis = -1)
|
func step(input: Matrix[float]): Matrix[float] {.used.} =
  ## Element-wise Heaviside step function: every negative entry maps
  ## to 0.0 and every other entry maps to 1.0.
  # The non-negative branch must yield the constant 1.0: returning x
  # itself there would turn this into ReLU rather than a step.
  input.apply(proc (x: float): float = (if x < 0.0: 0.0 else: 1.0), axis = -1)
|
||||||
func silu(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = 1 / (1 + exp(-x)), axis= -1)
|
func silu(input: Matrix[float]): Matrix[float] {.used.} =
  ## Element-wise SiLU (a.k.a. swish) activation:
  ## silu(x) = x * sigmoid(x) = x / (1 + exp(-x)).
  # The leading factor x is what distinguishes SiLU from the plain
  # logistic sigmoid 1 / (1 + exp(-x)).
  input.apply(proc (x: float): float = x / (1 + exp(-x)), axis = -1)
|
||||||
func relu(input: Matrix[float]): Matrix[float] {.used.} = input.apply(proc (x: float): float = max(0.0, x), axis = -1)
|
|
||||||
|
|
||||||
func htan(input: Matrix[float]): Matrix[float] {.used.} =
|
func htan(input: Matrix[float]): Matrix[float] {.used.} =
|
||||||
let f = proc (x: float): float =
|
let f = proc (x: float): float =
|
||||||
|
@ -260,6 +299,7 @@ func htan(input: Matrix[float]): Matrix[float] {.used.} =
|
||||||
{.hints: off.} # So nim doesn't complain about the naming
|
{.hints: off.} # So nim doesn't complain about the naming
|
||||||
var Sigmoid* = newActivation(sigmoid, sigmoidDerivative)
|
var Sigmoid* = newActivation(sigmoid, sigmoidDerivative)
|
||||||
var Softmax* = newActivation(softmax, softmaxDerivative)
|
var Softmax* = newActivation(softmax, softmaxDerivative)
|
||||||
|
var ReLU* = newActivation(relu, dxRelu)
|
||||||
var MSE* = newLoss(mse, dxMSE)
|
var MSE* = newLoss(mse, dxMSE)
|
||||||
{.pop.}
|
{.pop.}
|
||||||
|
|
||||||
|
|
|
@ -1079,7 +1079,7 @@ proc count*[T](self: Matrix[T], e: T): int =
|
||||||
proc replace*[T](self: Matrix[T], other: Matrix[T], copy: bool = false) =
|
proc replace*[T](self: Matrix[T], other: Matrix[T], copy: bool = false) =
|
||||||
## Replaces the data in self with the data from
|
## Replaces the data in self with the data from
|
||||||
## other (a copy is not performed unless copy equals
|
## other (a copy is not performed unless copy equals
|
||||||
## true). A reference to the object is returned
|
## true)
|
||||||
if copy:
|
if copy:
|
||||||
self.data[] = other.data[]
|
self.data[] = other.data[]
|
||||||
else:
|
else:
|
||||||
|
|
Loading…
Reference in New Issue