# NNExperiments/src/main.nim

import util/preprocessing
import util/matrix
import util/feature_extraction/text
import nn/network


import strformat
import sequtils
import json
import math
import times


proc loadData: tuple[corpus, results, testset, testResults: Matrix[string]] =
    ## Loads the training dataset and the testset for the supervised
    ## learning test.
    ##
    ## Both JSON files are parsed as objects: every key is used as a label
    ## and the value stored under it is iterated as a list of sentences.
    ## Each sentence is emitted together with the label it was found under,
    ## so `corpus`/`results` (and `testset`/`testResults`) stay index-aligned.
    let trainJson = parseFile("assets/datasets/spamham.json")
    let testJson = parseFile("assets/testsets/spamham.json")
    var
        trainSentences: seq[string] = @[]
        trainLabels: seq[string] = @[]
        testSentences: seq[string] = @[]
        testLabels: seq[string] = @[]
    # Flatten {label: [sentence, ...]} into two parallel sequences
    for label, sentences in pairs(trainJson):
        for sentence in items(sentences):
            trainSentences.add(sentence.getStr())
            trainLabels.add(label)
    for label, sentences in pairs(testJson):
        for sentence in items(sentences):
            testSentences.add(sentence.getStr())
            testLabels.add(label)
    result = (corpus: newMatrix(trainSentences),
              results: newMatrix(trainLabels),
              testset: newMatrix(testSentences),
              testResults: newMatrix(testLabels))


proc testMetrics(predictions, y: Matrix[float]): tuple[accuracy, precision, recall: float, tP, tN, fP, fN: int] =
    ## Computes the test metrics given the predictions and the
    ## expected results.
    ##
    ## For every index `i` below `y.len()`, `predictions[0][i]` is compared
    ## with `y[0][i]`:
    ## * equal values count as a success (a true positive when the value
    ##   is 1.0, a true negative otherwise);
    ## * predicted 1.0 with expected 0.0 counts as a false positive;
    ## * every other mismatch counts as a false negative.
    ##
    ## Returns accuracy (`successes / len(predictions)`), precision
    ## (`TP / (TP + FP)`), recall (`TP / (TP + FN)`) and the four raw
    ## confusion counters.
    ##
    ## NOTE: the ratios are plain float divisions with no guard, so a zero
    ## denominator (e.g. no positive predictions) is expected to yield a
    ## non-finite value such as NaN rather than raise.
    var confusion = @[0, 0, 0, 0] # TP, TN, FP, FN
    var success = 0
    # A range-based loop guarantees the index advances on every iteration;
    # a manual `while` counter that is never incremented would spin forever.
    for i in 0 ..< y.len():
        let predicted = predictions[0][i]
        let expected = y[0][i]
        if predicted == expected:
            success += 1
            if predicted == 1.0:
                confusion[0] += 1
            else:
                confusion[1] += 1
        elif predicted == 1.0 and expected == 0.0:
            confusion[2] += 1
        else:
            confusion[3] += 1
    result = (accuracy: success / len(predictions),
              precision: confusion[0] / (confusion[0] + confusion[2]),
              recall: confusion[0] / (confusion[0] + confusion[3]),
              tP: confusion[0],
              tN: confusion[1],
              fP: confusion[2],
              fN: confusion[3])


proc main =
    ## Runs the test.
    ##
    ## Steps, in order:
    ## 1. builds the label encoder, text preprocessor, TF-IDF vectorizer and
    ##    the neural network;
    ## 2. loads the dataset and vectorizes the training corpus;
    ## 3. truncates or zero-pads every vectorized row to exactly `inputSize`
    ##    values and pairs it with a two-element target (`[1.0, 0.0]` when the
    ##    encoded label equals 1, `[0.0, 1.0]` otherwise);
    ## 4. prints the classifier parameters and trains the network.
    ##
    ## The evaluation step at the end is currently commented out.
    const
        stopwords = @[
            "i", "me", "my", "myself", "we", "our", "ours",
            "ourselves", "you", "you're", "you've", "you'll", "you'd",
            "your", "yours", "yourself", "yourselves", "he", "him", "his",
            "himself", "she", "she's", "her", "hers", "herself", "it", "it's",
            "its", "itself", "they", "them", "their", "theirs", "themselves", "what",
            "which", "who", "whom", "this", "that", "that'll", "these", "those", "am",
            "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
            "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but",
            "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
            "with", "about", "against", "between", "into", "through", "during", "before",
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on",
            "off", "over", "under", "again", "further", "then", "once", "here", "there",
            "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
            "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same",
            "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't",
            "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain",
            "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
            "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't",
            "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn",
            "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't",
            "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
        epochs = 10
        batch = 100
        inputSize = 512

    # Pipeline components: label encoding, text cleaning, TF-IDF features
    let
        labelEncoder = newLabelEncoder()
        textCleaner = newTextPreprocessor(stopwords=newMatrix(stopwords),
                                          toLower=true,
                                          stripPunctuation=true,
                                          normalize=false)
        vectorizer = newTFIDFVectorizer(minDf=0.01,
                                        maxDf=0.7,
                                        preprocessor=textCleaner,
                                        sublinearTf=false,
                                        smoothIdf=true)
    var classifier = newNeuralNetwork(
        @[newDenseLayer(inputSize, 8),
          newDenseLayer(8, 16),
          newDenseLayer(16, 2)],
        lossFunc=MSE,
        activationFunc=ReLU,
        learnRate=0.1,
        momentum=0.3,
        weightRange=(-1.0, 1.0),
        biasRange=(-1.0, 1.0))

    echo "ProjectSydney v0.2b - Accuracy test"
    echo "\nLoading dataset and testset"
    let loadTime = cpuTime()
    let data = loadData()
    echo &"Data loaded in {cpuTime() - loadTime:.2f} seconds"

    echo "Processing and vectorizing dataset with TF-IDF weigths"
    let vectorTime = cpuTime()
    let xTrain = vectorizer.fitTransform(data.corpus, data.corpus)
    let yTrain = labelEncoder.fitTransform(data.results, data.results)[0]
    echo &"Vectorized in {cpuTime() - vectorTime:.2f} seconds"
    echo &"Feature count: {len(vectorizer.getFeatureNames())}"
    echo &"Vocabulary size: {len(vectorizer.getVocabulary())}"
    echo &"Corpus size: {len(data.corpus)}"
    # let yTest = encoder.transform(data.testResults)
    # let xTest = vectorizer.transform(data.testset)

    # Scratch buffer reused for every row, emptied after each sample is stored
    var rowBuffer: seq[float] = newSeqOfCap[float](inputSize)
    var samples: seq[tuple[x, y: Matrix[float]]] = @[]
    # Pad the data to fit into the network
    for i, row in xTrain:
        # Copy at most inputSize features from the vectorized row...
        for feature in row:
            if rowBuffer.len() < inputSize:
                rowBuffer.add(feature)
            else:
                break
        # ...then fill whatever is still missing with zeros
        for _ in rowBuffer.len() ..< inputSize:
            rowBuffer.add(0.0)
        let target =
            if yTrain[i] == 1: @[1.0, 0.0]
            else: @[0.0, 1.0]
        samples.add((newMatrix[float](rowBuffer), newMatrix[float](target)))
        rowBuffer.setLen(0)

    echo "Classifier parameters"
    echo &"\tLearn rate: {classifier.learnRate}"
    echo &"\tMomentum: {classifier.momentum}"
    stdout.write("\tNetwork layout: ")
    let lastLayer = classifier.layers.high()
    for i, layer in classifier.layers:
        stdout.write(&"{layer.inputSize}x{layer.outputSize}")
        # Separator between consecutive layers only, not after the last one
        if i < lastLayer:
            stdout.write(" -> ")
    echo ""

    echo &"Training neural network for {epochs} epochs with batch size of {batch}"
    let trainTime = cpuTime()
    classifier.train(epochs, batch, samples)
    echo &"Training completed in {cpuTime() - trainTime:.2f} seconds"
    #[echo "\nTest parameters"
    echo &"\tTest size: {len(data.testset)}"
    let testTime = cpuTime()
    let predictions = classifier.fastFeedForward(xTest)
    let metrics = testMetrics(predictions, y_test)
    echo &"\nTest completed in {cpuTime() - testTime:.2f} seconds, metrics below"
    echo &"\tAccuracy: {metrics.accuracy * 100:.2f}%"
    echo &"\tRecall: {metrics.recall:.2f}"
    echo &"\tPrecision: {metrics.precision:.2f}"
    echo &"\tF1-score: {pow((pow(metrics.precision, -1) + pow(metrics.recall, -1)) / 2, -1):.2f}"
    echo "\tConfusion matrix"
    echo &"\t\tTrue positives: {metrics.tP}"
    echo &"\t\tTrue negatives: {metrics.tN}"
    echo &"\t\tFalse negatives: {metrics.fN}"
    echo &"\t\tFalse positives: {metrics.fP}"
    ]#


when isMainModule:
    # Exit quietly with status 0 when the user interrupts the run with Ctrl+C
    proc onInterrupt() {.noconv.} =
        quit(0)

    setControlCHook(onInterrupt)
    main()