import util/preprocessing
import util/matrix
import util/feature_extraction/text
import nn/network

import strformat
import sequtils
import json
import math
import times


proc loadData: tuple[corpus, results, testset, testResults: Matrix[string]] =
    # Loads the dataset and the testset for the supervised learning test
    let dataset = parseFile("assets/datasets/spamham.json")
    let testset = parseFile("assets/testsets/spamham.json")
    var corpus: seq[string] = @[]
    var results: seq[string] = @[]
    var test: seq[string] = @[]
    var testResults: seq[string] = @[]
    for label in dataset.keys():
        for sentence in dataset[label]:
            corpus.add(sentence.getStr())
            results.add(label)
    for label in testset.keys():
        for sentence in testset[label]:
            test.add(sentence.getStr())
            testResults.add(label)
    result = (corpus: newMatrix(corpus), results: newMatrix(results),
              testset: newMatrix(test), testResults: newMatrix(testResults))
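

# Illustrative sketch of the JSON layout that loadData() expects: an object
# mapping each label to a list of sentences. The label names and sentences
# below are invented placeholders; only the overall shape is implied by the
# parsing loops above.
#
#   {
#       "spam": ["win a free prize now", "claim your reward today"],
#       "ham": ["are we still on for lunch tomorrow?"]
#   }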


proc testMetrics(predictions, y: Matrix[float]): tuple[accuracy, precision, recall: float, tP, tN, fP, fN: int] =
    # Computes the test metrics given the predictions and the
    # expected results
    var confusion = @[0, 0, 0, 0]   # TP, TN, FP, FN
    var success = 0
    var i = 0
    while i < y.len():
        var predicted = predictions[0][i]
        var expected = y[0][i]
        if predicted == expected:
            success += 1
            if predicted == 1.0:
                confusion[0] += 1
            else:
                confusion[1] += 1
        elif predicted == 1.0 and expected == 0.0:
            confusion[2] += 1
        else:
            confusion[3] += 1
        inc(i)   # advance to the next sample
    result = (accuracy: success / len(predictions),
              precision: confusion[0] / (confusion[0] + confusion[2]),   # TP / (TP + FP)
              recall: confusion[0] / (confusion[0] + confusion[3]),      # TP / (TP + FN)
              tP: confusion[0], tN: confusion[1], fP: confusion[2], fN: confusion[3])


proc main =
    # Runs the test
    const stopwords = @["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're",
                        "you've", "you'll", "you'd", "your", "yours", "yourself", "yourselves", "he",
                        "him", "his", "himself", "she", "she's", "her", "hers", "herself", "it", "it's",
                        "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
                        "who", "whom", "this", "that", "that'll", "these", "those", "am", "is", "are",
                        "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
                        "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as",
                        "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
                        "into", "through", "during", "before", "after", "above", "below", "to", "from",
                        "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
                        "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
                        "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor",
                        "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can",
                        "will", "just", "don", "don't", "should", "should've", "now", "d", "ll", "m",
                        "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn", "couldn't", "didn",
                        "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven",
                        "haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't",
                        "needn", "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't",
                        "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
    const epochs = 10
    const batch = 100
    const inputSize = 512
    let encoder = newLabelEncoder()
    let cleaner = newTextPreprocessor(stopwords=newMatrix(stopwords), toLower=true,
                                      stripPunctuation=true, normalize=false)
    let vectorizer = newTFIDFVectorizer(minDf=0.01, maxDf=0.7, preprocessor=cleaner,
                                        sublinearTf=false, smoothIdf=true)
    var classifier = newNeuralNetwork(@[newDenseLayer(inputSize, 8), newDenseLayer(8, 16),
                                        newDenseLayer(16, 2)],
                                      lossFunc=MSE, activationFunc=ReLU, learnRate=0.1,
                                      momentum=0.3, weightRange=(-1.0, 1.0), biasRange=(-1.0, 1.0))
    echo "ProjectSydney v0.2b - Accuracy test"
    echo "\nLoading dataset and testset"
    let loadTime = cpuTime()
    let data = loadData()
    echo &"Data loaded in {cpuTime() - loadTime:.2f} seconds"
    echo "Processing and vectorizing dataset with TF-IDF weights"
    let vectorTime = cpuTime()
    let xTrain = vectorizer.fitTransform(data.corpus, data.corpus)
    let yTrain = encoder.fitTransform(data.results, data.results)[0]
    echo &"Vectorized in {cpuTime() - vectorTime:.2f} seconds"
    echo &"Feature count: {len(vectorizer.getFeatureNames())}"
    echo &"Vocabulary size: {len(vectorizer.getVocabulary())}"
    echo &"Corpus size: {len(data.corpus)}"
    # let yTest = encoder.transform(data.testResults)
    # let xTest = vectorizer.transform(data.testset)
    var tempData: seq[float] = newSeqOfCap[float](inputSize)
    var trainData: seq[tuple[x, y: Matrix[float]]] = @[]
    # Pad the data to fit into the network: truncate or zero-pad each TF-IDF
    # row to inputSize and one-hot encode the label for the 2-neuron output
    for i, row in xTrain:
        for e in row:
            if tempData.len() == inputSize:
                break
            tempData.add(e)
        while tempData.len() < inputSize:
            tempData.add(0.0)
        if yTrain[i] == 1:
            trainData.add((newMatrix[float](tempData), newMatrix[float](@[1.0, 0.0])))
        else:
            trainData.add((newMatrix[float](tempData), newMatrix[float](@[0.0, 1.0])))
        tempData.setLen(0)
    echo "Classifier parameters"
    echo &"\tLearn rate: {classifier.learnRate}"
    echo &"\tMomentum: {classifier.momentum}"
    stdout.write("\tNetwork layout: ")
    for i, layer in classifier.layers:
        stdout.write(&"{layer.inputSize}x{layer.outputSize}")
        if i < classifier.layers.high():
            stdout.write(" -> ")
    echo ""
    echo &"Training neural network for {epochs} epochs with batch size of {batch}"
    let trainTime = cpuTime()
    classifier.train(epochs, batch, trainData)
    echo &"Training completed in {cpuTime() - trainTime:.2f} seconds"
    #[
    echo "\nTest parameters"
    echo &"\tTest size: {len(data.testset)}"
    let testTime = cpuTime()
    let predictions = classifier.fastFeedForward(xTest)
    let metrics = testMetrics(predictions, yTest)
    echo &"\nTest completed in {cpuTime() - testTime:.2f} seconds, metrics below"
    echo &"\tAccuracy: {metrics.accuracy * 100:.2f}%"
    echo &"\tRecall: {metrics.recall:.2f}"
    echo &"\tPrecision: {metrics.precision:.2f}"
    echo &"\tF1-score: {pow((pow(metrics.precision, -1) + pow(metrics.recall, -1)) / 2, -1):.2f}"
    echo "\tConfusion matrix"
    echo &"\t\tTrue positives: {metrics.tP}"
    echo &"\t\tTrue negatives: {metrics.tN}"
    echo &"\t\tFalse negatives: {metrics.fN}"
    echo &"\t\tFalse positives: {metrics.fP}"
    ]#


when isMainModule:
    setControlCHook(proc () {.noconv.} = quit(0))
    main()
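

# A minimal way to build and run this accuracy test, assuming a standard Nim
# toolchain and that the command is issued from the project root so the
# relative "assets/..." paths resolve (the module name below is a placeholder):
#
#   nim c -r -d:release spamham_test.nim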