2023-03-22 11:49:43 +01:00
|
|
|
import util/preprocessing
|
|
|
|
import util/matrix
|
|
|
|
import util/feature_extraction/text
|
2022-12-20 12:08:24 +01:00
|
|
|
import nn/network
|
|
|
|
|
2022-12-23 00:17:57 +01:00
|
|
|
|
2023-03-22 11:49:43 +01:00
|
|
|
import strformat
|
|
|
|
import sequtils
|
|
|
|
import json
|
|
|
|
import math
|
|
|
|
import times
|
|
|
|
|
|
|
|
|
|
|
|
proc loadData: tuple[corpus, results, testset, testResults: Matrix[string]] =
  ## Loads the spam/ham dataset and testset for the supervised learning
  ## test and flattens each JSON object ({label: [sentences]}) into
  ## parallel sentence/label sequences wrapped in string matrices.

  proc flatten(node: JsonNode): tuple[sentences, labels: seq[string]] =
    ## Walks a {label: [sentences]} JSON object and emits one
    ## (sentence, label) pair per entry, preserving key order.
    for label in node.keys():
      for sentence in node[label]:
        result.sentences.add(sentence.getStr())
        result.labels.add(label)

  let (corpus, results) = flatten(parseFile("assets/datasets/spamham.json"))
  let (test, testResults) = flatten(parseFile("assets/testsets/spamham.json"))
  result = (corpus: newMatrix(corpus),
            results: newMatrix(results),
            testset: newMatrix(test),
            testResults: newMatrix(testResults))
|
|
|
|
|
|
|
|
|
|
|
|
proc testMetrics(predictions, y: Matrix[float]): tuple[accuracy, precision, recall: float, tP, tN, fP, fN: int] =
  ## Computes the test metrics (accuracy, precision, recall and the raw
  ## confusion-matrix counts) given the predictions and the expected
  ## results. Both matrices are read as row 0, so they are assumed to be
  ## 1 x n with matching lengths — TODO confirm against Matrix's layout.
  var confusion = @[0, 0, 0, 0] # TP, TN, FP, FN
  var success = 0
  var i = 0
  while i < y.len():
    var predicted = predictions[0][i]
    var expected = y[0][i]
    if predicted == expected:
      success += 1
      if predicted == 1.0:
        confusion[0] += 1   # true positive
      else:
        confusion[1] += 1   # true negative
    elif predicted == 1.0 and expected == 0.0:
      confusion[2] += 1     # false positive
    else:
      confusion[3] += 1     # false negative
    # Fix: the original loop never advanced i, so it looped forever
    i += 1
  # Fix: accuracy previously divided by len(predictions); success is
  # counted over y.len() samples, so that is the correct denominator.
  result = (accuracy: success / y.len(),
            precision: confusion[0] / (confusion[0] + confusion[2]),
            recall: confusion[0] / (confusion[0] + confusion[3]),
            tP: confusion[0],
            tN: confusion[1],
            fP: confusion[2],
            fN: confusion[3])
|
|
|
|
|
|
|
|
|
|
|
|
proc main =
  ## Runs the end-to-end accuracy test: loads the dataset, vectorizes it
  ## with TF-IDF, trains the neural network classifier and (currently
  ## disabled, see the block comment at the end) evaluates it on the
  ## testset.
  # English stopwords stripped from the corpus before vectorization
  const stopwords = @["i", "me", "my", "myself", "we", "our", "ours",
                      "ourselves", "you", "you're", "you've", "you'll", "you'd",
                      "your", "yours", "yourself", "yourselves", "he", "him", "his",
                      "himself", "she", "she's", "her", "hers", "herself", "it", "it's",
                      "its", "itself", "they", "them", "their", "theirs", "themselves", "what",
                      "which", "who", "whom", "this", "that", "that'll", "these", "those", "am",
                      "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
                      "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but",
                      "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
                      "with", "about", "against", "between", "into", "through", "during", "before",
                      "after", "above", "below", "to", "from", "up", "down", "in", "out", "on",
                      "off", "over", "under", "again", "further", "then", "once", "here", "there",
                      "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
                      "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same",
                      "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't",
                      "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain",
                      "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
                      "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't",
                      "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn",
                      "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't",
                      "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
  const epochs = 10     # number of training passes over the data
  const batch = 100     # mini-batch size used during training
  const inputSize = 512 # network input width; TF-IDF rows are cut/padded to this
  let encoder = newLabelEncoder()
  let cleaner = newTextPreprocessor(stopwords=newMatrix(stopwords), toLower=true,
                                    stripPunctuation=true, normalize=false)
  let vectorizer = newTFIDFVectorizer(minDf=0.01, maxDf=0.7, preprocessor=cleaner,
                                      sublinearTf=false, smoothIdf=true)
  # 512 -> 8 -> 16 -> 2 dense network; output is a 2-wide one-hot score
  var classifier = newNeuralNetwork(@[newDenseLayer(inputSize, 8),
                                      newDenseLayer(8, 16),
                                      newDenseLayer(16, 2)],
                                    lossFunc=MSE,
                                    activationFunc=ReLU,
                                    learnRate=0.1,
                                    momentum=0.3,
                                    weightRange=(-1.0, 1.0),
                                    biasRange=(-1.0, 1.0))
  echo "ProjectSydney v0.2b - Accuracy test"
  echo "\nLoading dataset and testset"
  let loadTime = cpuTime()
  let data = loadData()
  echo &"Data loaded in {cpuTime() - loadTime:.2f} seconds"
  # Fix: status message typo "weigths" -> "weights"
  echo "Processing and vectorizing dataset with TF-IDF weights"
  let vectorTime = cpuTime()
  let xTrain = vectorizer.fitTransform(data.corpus, data.corpus)
  let yTrain = encoder.fitTransform(data.results, data.results)[0]
  echo &"Vectorized in {cpuTime() - vectorTime:.2f} seconds"
  echo &"Feature count: {len(vectorizer.getFeatureNames())}"
  echo &"Vocabulary size: {len(vectorizer.getVocabulary())}"
  echo &"Corpus size: {len(data.corpus)}"
  # let yTest = encoder.transform(data.testResults)
  # let xTest = vectorizer.transform(data.testset)
  var tempData: seq[float] = newSeqOfCap[float](inputSize)
  var trainData: seq[tuple[x, y: Matrix[float]]] = @[]
  # Pad (or truncate) each TF-IDF row to exactly inputSize entries and
  # pair it with a one-hot encoded label so it fits into the network
  for i, row in xTrain:
    for e in row:
      if tempData.len() == inputSize:
        break
      tempData.add(e)
    while tempData.len() < inputSize:
      tempData.add(0.0)
    if yTrain[i] == 1:
      trainData.add((newMatrix[float](tempData), newMatrix[float](@[1.0, 0.0])))
    else:
      trainData.add((newMatrix[float](tempData), newMatrix[float](@[0.0, 1.0])))
    tempData.setLen(0) # reuse the buffer for the next row
  echo "Classifier parameters"
  echo &"\tLearn rate: {classifier.learnRate}"
  echo &"\tMomentum: {classifier.momentum}"
  stdout.write("\tNetwork layout: ")
  # Print the layer sizes as "in x out" joined by arrows
  for i, layer in classifier.layers:
    stdout.write(&"{layer.inputSize}x{layer.outputSize}")
    if i < classifier.layers.high():
      stdout.write(" -> ")
  echo ""
  echo &"Training neural network for {epochs} epochs with batch size of {batch}"
  let trainTime = cpuTime()
  classifier.train(epochs, batch, trainData)
  echo &"Training completed in {cpuTime() - trainTime:.2f} seconds"
  # Evaluation against the testset is currently disabled; note it refers
  # to the commented-out xTest/yTest bindings above (and `y_test`, which
  # would need renaming to yTest before re-enabling).
  #[echo "\nTest parameters"
  echo &"\tTest size: {len(data.testset)}"
  let testTime = cpuTime()
  let predictions = classifier.fastFeedForward(xTest)
  let metrics = testMetrics(predictions, y_test)
  echo &"\nTest completed in {cpuTime() - testTime:.2f} seconds, metrics below"
  echo &"\tAccuracy: {metrics.accuracy * 100:.2f}%"
  echo &"\tRecall: {metrics.recall:.2f}"
  echo &"\tPrecision: {metrics.precision:.2f}"
  echo &"\tF1-score: {pow((pow(metrics.precision, -1) + pow(metrics.recall, -1)) / 2, -1):.2f}"
  echo "\tConfusion matrix"
  echo &"\t\tTrue positives: {metrics.tP}"
  echo &"\t\tTrue negatives: {metrics.tN}"
  echo &"\t\tFalse negatives: {metrics.fN}"
  echo &"\t\tFalse positives: {metrics.fP}"
  ]#
|
|
|
|
|
|
|
|
|
|
|
|
when isMainModule:
  # Exit cleanly (status 0) on Ctrl+C instead of dying with an
  # unhandled interrupt; {.noconv.} is required by setControlCHook
  setControlCHook(proc () {.noconv.} = quit(0))
  main()
|
|
|
|
|
2023-03-21 18:56:51 +01:00
|
|
|
|