diff --git a/src/main.nim b/src/main.nim
index 007bb0b..7346739 100644
--- a/src/main.nim
+++ b/src/main.nim
@@ -12,16 +12,20 @@ proc mse(a, b: Matrix[float]): float =
 # Derivative of MSE
 func dxMSE*(x, y: Matrix[float]): Matrix[float] = 2.0 * (x - y)
 
-func dx*(x, y: Matrix[float]): Matrix[float] = zeros[float](x.shape)
-
 # A bunch of vectorized activation functions
 func sigmoid*(input: Matrix[float]): Matrix[float] = 
     result = input.apply(proc (x: float): float = 1 / (1 + exp(-x)) , axis = -1)
 
+func sigmoidDerivative*(input: Matrix[float]): Matrix[float] = sigmoid(input) * (1.0 - sigmoid(input))
+
+
 func softmax*(input: Matrix[float]): Matrix[float] = 
     var input = input - input.max()
     result = input.apply(math.exp, axis = -1) / input.apply(math.exp, axis = -1).sum()
 
+func softmaxDerivative*(input: Matrix[float]): Matrix[float] = zeros[float](input.shape)
+
+
 func step*(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = (if x < 0.0: 0.0 else: x), axis = -1)
 func silu*(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = 1 / (1 + exp(-x)), axis= -1)
 func relu*(input: Matrix[float]): Matrix[float] = input.apply(proc (x: float): float = max(0.0, x), axis = -1)
@@ -33,9 +37,10 @@ func htan*(input: Matrix[float]): Matrix[float] =
     input.apply(f, axis = -1)
 
 
-var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, newActivation(sigmoid, dx)), newDenseLayer(3, 2, newActivation(sigmoid, dx)),
-                             newDenseLayer(2, 3, newActivation(softmax, dx))],
-                           lossFunc=newLoss(mse, dxMSE),
-                           learnRate=0.05, weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -10.0, stop: 10.0),
-                           momentum=0.55)
+var mlp = newNeuralNetwork(@[newDenseLayer(2, 3, newActivation(sigmoid, sigmoidDerivative)), 
+                             newDenseLayer(3, 2, newActivation(sigmoid, sigmoidDerivative)),
+                             newDenseLayer(2, 3, newActivation(softmax, softmaxDerivative))],
+                           lossFunc=newLoss(mse, dxMSE), learnRate=0.05, momentum=0.55,
+                           weightRange=(start: -1.0, stop: 1.0), biasRange=(start: -10.0, stop: 10.0))
 echo mlp.feedforward(newMatrix[float](@[1.0, 2.0]))
+
diff --git a/src/nn/network.nim b/src/nn/network.nim
index e66b1b5..861b399 100644
--- a/src/nn/network.nim
+++ b/src/nn/network.nim
@@ -45,7 +45,7 @@ type
     Activation* = ref object
         ## An activation function
         function: proc (input: Matrix[float]): Matrix[float] {.noSideEffect.}
-        derivative: proc (x, y: Matrix[float]): Matrix[float] {.noSideEffect.}
+        derivative: proc (x: Matrix[float]): Matrix[float] {.noSideEffect.}
     Layer* = ref object
         ## A generic neural network
         ## layer
@@ -77,7 +77,7 @@ proc newLoss*(function: proc (a, b: Matrix[float]): float, derivative: proc (x,
     result.derivative = derivative
 
 
-proc newActivation*(function: proc (input: Matrix[float]): Matrix[float] {.noSideEffect.}, derivative: proc (x, y: Matrix[float]): Matrix[float] {.noSideEffect.}): Activation =
+proc newActivation*(function: proc (input: Matrix[float]): Matrix[float] {.noSideEffect.}, derivative: proc (x: Matrix[float]): Matrix[float] {.noSideEffect.}): Activation =
     ## Creates a new Activation object
     new(result)
     result.function = function
diff --git a/src/nn/util/matrix.nim b/src/nn/util/matrix.nim
index a55d304..f84fbed 100644
--- a/src/nn/util/matrix.nim
+++ b/src/nn/util/matrix.nim
@@ -661,18 +661,29 @@ proc `==`*[T](a: Matrix[T], b: MatrixView[T]): Matrix[bool] =
     return a[0] == b
 
 
-proc diag*[T](a: Matrix[T], offset: int = 0): Matrix[T] =
-    ## Returns the diagonal of the given
-    ## matrix starting at the given offset
-    if offset >= a.shape.cols:
-        return newMatrix[T](@[])
-    var current = offset.ind2sub(a.shape)
-    var res = newSeqOfCap[T](a.shape.getSize())
-    while current.row < a.shape.rows and current.col < a.shape.cols:
-        res.add(a.data[a.getIndex(current.row, current.col)])
-        inc(current.row)
-        inc(current.col)
-    result = newMatrix(res)
+proc diag*[T](a: Matrix[T], k: int = 0): Matrix[T] =
+    ## Returns the kth diagonal of 
+    ## the given matrix if a is 2-D 
+    ## or a 2-D matrix with a on its
+    ## kth diagonal if it is 1-D
+    if a.shape.rows > 0:
+        if k >= a.shape.cols:
+            return newMatrix[T](@[])
+        var current = k.ind2sub(a.shape)
+        var res = newSeqOfCap[T](a.shape.getSize())
+        while current.row < a.shape.rows and current.col < a.shape.cols:
+            res.add(a.data[a.getIndex(current.row, current.col)])
+            inc(current.row)
+            inc(current.col)
+        result = newMatrix(res)
+    else:
+        let size = len(a) + k
+        result = zeros[T]((size, size))
+        var current = k.ind2sub(a.shape)
+        for e in a[0]:
+            result[current.row, current.col] = e
+            inc(current.row)
+            inc(current.col)
 
 
 proc fliplr*[T](self: Matrix[T]): Matrix[T] =
@@ -1033,30 +1044,32 @@ when isMainModule:
 
     var m = newMatrix[int](@[@[1, 2, 3], @[4, 5, 6]])
     var k = m.transpose()
-    assert k[2, 1] == m[1, 2], "transpose mismatch"
-    assert all(m.transpose() == k), "transpose mismatch"
-    assert k.sum() == m.sum(), "element sum mismatch"
-    assert all(k.sum(axis=1) == m.sum(axis=0)), "sum over axis mismatch"
-    assert all(k.sum(axis=0) == m.sum(axis=1)), "sum over axis mismatch" 
+    doAssert k[2, 1] == m[1, 2], "transpose mismatch"
+    doAssert all(m.transpose() == k), "transpose mismatch"
+    doAssert k.sum() == m.sum(), "element sum mismatch"
+    doAssert all(k.sum(axis=1) == m.sum(axis=0)), "sum over axis mismatch"
+    doAssert all(k.sum(axis=0) == m.sum(axis=1)), "sum over axis mismatch" 
     var y = newMatrix[int](@[1, 2, 3, 4])
-    assert y.sum() == 10, "element sum mismatch"
-    assert (y + y).sum() == 20, "matrix sum mismatch"
-    assert all(m + m == m * 2), "m + m != m * 2"
+    doAssert y.sum() == 10, "element sum mismatch"
+    doAssert (y + y).sum() == 20, "matrix sum mismatch"
+    doAssert all(m + m == m * 2), "m + m != m * 2"
     var z = newMatrix[int](@[1, 2, 3])
-    assert (m * z).sum() == 46, "matrix multiplication mismatch"
-    assert all(z * z == z.apply(pow, 2, axis = -1, copy=true)), "matrix multiplication mismatch"
+    doAssert (m * z).sum() == 46, "matrix multiplication mismatch"
+    doAssert all(z * z == z.apply(pow, 2, axis = -1, copy=true)), "matrix multiplication mismatch"
     var x = newMatrix[int](@[0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
-    assert (x < 5).where(x, x * 10).sum() == 360, "where mismatch"
-    assert all((x < 5).where(x, x * 10) == x.where(x < 5, x * 10)), "where mismatch"
-    assert x.max() == 9, "max mismatch"
-    assert x.argmax() == 10, "argmax mismatch"
-    assert all(newMatrix[int](@[12, 23]).dot(newMatrix[int](@[@[11, 22], @[33, 44]])) == newMatrix[int](@[891, 1276]))
-    assert all(newMatrix[int](@[@[1, 2, 3], @[2, 3, 4]]).dot(newMatrix[int](@[1, 2, 3])) == newMatrix[int](@[14, 20]))
-    assert all(m.diag() == newMatrix[int](@[1, 5]))
-    assert all(m.diag(1) == newMatrix[int](@[2, 6]))
-    assert all(m.diag(2) == newMatrix[int](@[3]))
-    assert m.diag(3).len() == 0
+    doAssert (x < 5).where(x, x * 10).sum() == 360, "where mismatch"
+    doAssert all((x < 5).where(x, x * 10) == x.where(x < 5, x * 10)), "where mismatch"
+    doAssert x.max() == 9, "max mismatch"
+    doAssert x.argmax() == 10, "argmax mismatch"
+    doAssert all(newMatrix[int](@[12, 23]).dot(newMatrix[int](@[@[11, 22], @[33, 44]])) == newMatrix[int](@[891, 1276]))
+    doAssert all(newMatrix[int](@[@[1, 2, 3], @[2, 3, 4]]).dot(newMatrix[int](@[1, 2, 3])) == newMatrix[int](@[14, 20]))
+    doAssert all(m.diag() == newMatrix[int](@[1, 5]))
+    doAssert all(m.diag(1) == newMatrix[int](@[2, 6]))
+    doAssert all(m.diag(2) == newMatrix[int](@[3]))
+    doAssert m.diag(3).len() == 0
     var j = m.fliplr()
-    assert all(j.diag() == newMatrix[int](@[3, 5]))
-    assert all(j.diag(1) == newMatrix[int](@[2, 4]))
-    assert all(j.diag(2) == newMatrix[int](@[1]))
+    doAssert all(j.diag() == newMatrix[int](@[3, 5]))
+    doAssert all(j.diag(1) == newMatrix[int](@[2, 4]))
+    doAssert all(j.diag(2) == newMatrix[int](@[1]))
+    var o = newMatrix[int](@[1, 2, 3])
+    doAssert all(o.diag() == newMatrix[int](@[@[1, 0, 0], @[0, 2, 0], @[0, 0, 3]]))
\ No newline at end of file