# NNExperiments/src/main.nim
#
# Source listing metadata: 170 lines, 7.3 KiB, Nim

import util/preprocessing
import util/matrix
import util/feature_extraction/text
import nn/network
import strformat
import sequtils
import json
import math
import times
proc loadData: tuple[corpus, results, testset, testResults: Matrix[string]] =
  ## Reads the training dataset and the testset for the supervised
  ## learning test from their JSON files.
  ##
  ## Each file is iterated as an object whose keys are labels and whose
  ## values are lists of sentences. Every sentence is appended to the
  ## text sequence and its label to the parallel label sequence, so the
  ## two stay index-aligned. The four sequences are wrapped with
  ## `newMatrix` and returned as a named tuple.
  let trainJson = parseFile("assets/datasets/spamham.json")
  let testJson = parseFile("assets/testsets/spamham.json")
  var
    trainSentences = newSeq[string]()
    trainLabels = newSeq[string]()
    testSentences = newSeq[string]()
    testLabels = newSeq[string]()
  # Flatten {label: [sentence, ...]} into two parallel sequences.
  for label, sentences in trainJson.pairs:
    for sentence in sentences.items:
      trainSentences.add(sentence.getStr())
      trainLabels.add(label)
  for label, sentences in testJson.pairs:
    for sentence in sentences.items:
      testSentences.add(sentence.getStr())
      testLabels.add(label)
  (corpus: newMatrix(trainSentences),
   results: newMatrix(trainLabels),
   testset: newMatrix(testSentences),
   testResults: newMatrix(testLabels))
proc testMetrics(predictions, y: Matrix[float]): tuple[accuracy, precision, recall: float, tP, tN, fP, fN: int] =
  ## Computes the test metrics given the predictions and the expected
  ## results.
  ##
  ## For every index `i` in `0 ..< y.len()` the value `predictions[0][i]`
  ## is compared with `y[0][i]`; a value of `1.0` is counted as the
  ## positive class. Returns accuracy, precision and recall together
  ## with the four confusion counts (`tP`, `tN`, `fP`, `fN`).
  ##
  ## The precision and recall divisions are not guarded against a zero
  ## denominator (no positive predictions / no positive samples).
  var
    truePos = 0
    trueNeg = 0
    falsePos = 0
    falseNeg = 0
  # A bounded `for` loop replaces the previous `while` loop, which never
  # advanced its index and therefore never terminated.
  for i in 0 ..< y.len():
    let predicted = predictions[0][i]
    let expected = y[0][i]
    if predicted == expected:
      if predicted == 1.0:
        truePos += 1
      else:
        trueNeg += 1
    elif predicted == 1.0 and expected == 0.0:
      falsePos += 1
    else:
      # Every remaining mismatch is counted as a false negative.
      falseNeg += 1
  # Correct predictions are exactly the true positives and negatives.
  let success = truePos + trueNeg
  # NOTE(review): the loop bound is `y.len()` while the accuracy
  # denominator is `len(predictions)`, as in the original code; confirm
  # that both describe the number of evaluated samples.
  result = (accuracy: success / len(predictions),
            precision: truePos / (truePos + falsePos),
            recall: truePos / (truePos + falseNeg),
            tP: truePos,
            tN: trueNeg,
            fP: falsePos,
            fN: falseNeg)
proc main =
  ## Runs the accuracy test: loads the data, vectorizes the corpus with
  ## TF-IDF, shapes every sample to the network input size, prints the
  ## classifier parameters and trains the neural network.
  ##
  ## The evaluation on the testset is currently disabled; the code for
  ## it is kept in comments inside the body.
  const stopwords = @["i", "me", "my", "myself", "we", "our", "ours",
    "ourselves", "you", "you're", "you've", "you'll", "you'd",
    "your", "yours", "yourself", "yourselves", "he", "him", "his",
    "himself", "she", "she's", "her", "hers", "herself", "it", "it's",
    "its", "itself", "they", "them", "their", "theirs", "themselves", "what",
    "which", "who", "whom", "this", "that", "that'll", "these", "those", "am",
    "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but",
    "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
    "with", "about", "against", "between", "into", "through", "during", "before",
    "after", "above", "below", "to", "from", "up", "down", "in", "out", "on",
    "off", "over", "under", "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
    "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't",
    "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain",
    "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
    "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't",
    "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn",
    "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't",
    "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
  const
    epochs = 10
    batch = 100
    inputSize = 512  # width of the first dense layer

  # Build the pipeline pieces before anything is printed.
  let encoder = newLabelEncoder()
  let cleaner = newTextPreprocessor(stopwords=newMatrix(stopwords),
                                    toLower=true,
                                    stripPunctuation=true,
                                    normalize=false)
  let vectorizer = newTFIDFVectorizer(minDf=0.01,
                                      maxDf=0.7,
                                      preprocessor=cleaner,
                                      sublinearTf=false,
                                      smoothIdf=true)
  var classifier = newNeuralNetwork(
    @[newDenseLayer(inputSize, 8),
      newDenseLayer(8, 16),
      newDenseLayer(16, 2)],
    lossFunc=MSE,
    activationFunc=ReLU,
    learnRate=0.1,
    momentum=0.3,
    weightRange=(-1.0, 1.0),
    biasRange=(-1.0, 1.0))

  echo "ProjectSydney v0.2b - Accuracy test"
  echo "\nLoading dataset and testset"
  let loadStart = cpuTime()
  let data = loadData()
  echo &"Data loaded in {cpuTime() - loadStart:.2f} seconds"

  echo "Processing and vectorizing dataset with TF-IDF weigths"
  let vectorStart = cpuTime()
  let xTrain = vectorizer.fitTransform(data.corpus, data.corpus)
  let yTrain = encoder.fitTransform(data.results, data.results)[0]
  echo &"Vectorized in {cpuTime() - vectorStart:.2f} seconds"
  echo &"Feature count: {len(vectorizer.getFeatureNames())}"
  echo &"Vocabulary size: {len(vectorizer.getVocabulary())}"
  echo &"Corpus size: {len(data.corpus)}"
  # Disabled together with the evaluation code further down:
  # let yTest = encoder.transform(data.testResults)
  # let xTest = vectorizer.transform(data.testset)

  # Shape every row to exactly `inputSize` values: extra values are
  # dropped and short rows are filled up with 0.0. One scratch buffer is
  # reused for all rows.
  var features = newSeqOfCap[float](inputSize)
  var trainData: seq[tuple[x, y: Matrix[float]]] = @[]
  for i, row in xTrain:
    features.setLen(0)
    for value in row:
      if features.len() == inputSize:
        break
      features.add(value)
    for _ in features.len() ..< inputSize:
      features.add(0.0)
    # Two-element target: [1.0, 0.0] when the encoded label equals 1,
    # [0.0, 1.0] otherwise.
    let target =
      if yTrain[i] == 1: @[1.0, 0.0]
      else: @[0.0, 1.0]
    trainData.add((newMatrix[float](features), newMatrix[float](target)))

  echo "Classifier parameters"
  echo &"\tLearn rate: {classifier.learnRate}"
  echo &"\tMomentum: {classifier.momentum}"
  stdout.write("\tNetwork layout: ")
  for i, layer in classifier.layers:
    # The arrow goes between layers, so it precedes all but the first.
    if i > 0:
      stdout.write(" -> ")
    stdout.write(&"{layer.inputSize}x{layer.outputSize}")
  echo ""

  echo &"Training neural network for {epochs} epochs with batch size of {batch}"
  let trainStart = cpuTime()
  classifier.train(epochs, batch, trainData)
  echo &"Training completed in {cpuTime() - trainStart:.2f} seconds"
  # Evaluation on the testset, currently disabled.
  #[echo "\nTest parameters"
  echo &"\tTest size: {len(data.testset)}"
  let testTime = cpuTime()
  let predictions = classifier.fastFeedForward(xTest)
  let metrics = testMetrics(predictions, y_test)
  echo &"\nTest completed in {cpuTime() - testTime:.2f} seconds, metrics below"
  echo &"\tAccuracy: {metrics.accuracy * 100:.2f}%"
  echo &"\tRecall: {metrics.recall:.2f}"
  echo &"\tPrecision: {metrics.precision:.2f}"
  echo &"\tF1-score: {pow((pow(metrics.precision, -1) + pow(metrics.recall, -1)) / 2, -1):.2f}"
  echo "\tConfusion matrix"
  echo &"\t\tTrue positives: {metrics.tP}"
  echo &"\t\tTrue negatives: {metrics.tN}"
  echo &"\t\tFalse negatives: {metrics.fN}"
  echo &"\t\tFalse positives: {metrics.fP}"
  ]#
when isMainModule:
  # Ctrl+C handler: end the program with exit status 0.
  proc onInterrupt() {.noconv.} =
    quit(0)

  setControlCHook(onInterrupt)
  main()