2023-03-22 11:49:43 +01:00
|
|
|
import util/preprocessing
|
|
|
|
import util/matrix
|
|
|
|
import util/feature_extraction/text
|
2022-12-20 12:08:24 +01:00
|
|
|
import nn/network
|
|
|
|
|
2022-12-23 00:17:57 +01:00
|
|
|
|
2023-03-22 11:49:43 +01:00
|
|
|
import strformat
|
|
|
|
import sequtils
|
|
|
|
import json
|
|
|
|
import math
|
|
|
|
import times
|
|
|
|
|
|
|
|
|
|
|
|
proc loadData: tuple[corpus, results, testset, testResults: Matrix[string]] =
  ## Loads the spam/ham dataset and testset for the supervised learning
  ## test and flattens each JSON object ({label: [sentences]}) into
  ## parallel sentence/label sequences wrapped in string matrices.

  proc flatten(node: JsonNode): tuple[sentences, labels: seq[string]] =
    ## Walks a {label: [sentences]} JSON object and emits one
    ## (sentence, label) pair per entry, preserving key order.
    for label in node.keys():
      for sentence in node[label]:
        result.sentences.add(sentence.getStr())
        result.labels.add(label)

  let (corpus, results) = flatten(parseFile("assets/datasets/spamham.json"))
  let (test, testResults) = flatten(parseFile("assets/testsets/spamham.json"))
  result = (corpus: newMatrix(corpus),
            results: newMatrix(results),
            testset: newMatrix(test),
            testResults: newMatrix(testResults))
|
|
|
|
|
|
|
|
|
|
|
|
proc testMetrics(predictions, y: Matrix[float]): tuple[accuracy, precision, recall: float, tP, tN, fP, fN: int] =
  ## Computes the test metrics (accuracy, precision, recall and the raw
  ## confusion-matrix counts) given the predictions and the expected
  ## results. Both matrices are read as row 0, so they are assumed to be
  ## 1 x n with matching lengths — TODO confirm against Matrix's layout.
  var confusion = @[0, 0, 0, 0] # TP, TN, FP, FN
  var success = 0
  var i = 0
  while i < y.len():
    var predicted = predictions[0][i]
    var expected = y[0][i]
    if predicted == expected:
      success += 1
      if predicted == 1.0:
        confusion[0] += 1   # true positive
      else:
        confusion[1] += 1   # true negative
    elif predicted == 1.0 and expected == 0.0:
      confusion[2] += 1     # false positive
    else:
      confusion[3] += 1     # false negative
    # Fix: the original loop never advanced i, so it looped forever
    i += 1
  # Fix: accuracy previously divided by len(predictions); success is
  # counted over y.len() samples, so that is the correct denominator.
  result = (accuracy: success / y.len(),
            precision: confusion[0] / (confusion[0] + confusion[2]),
            recall: confusion[0] / (confusion[0] + confusion[3]),
            tP: confusion[0],
            tN: confusion[1],
            fP: confusion[2],
            fN: confusion[3])
|
|
|
|
|
|
|
|
|
|
|
|
proc main =
  ## Runs the end-to-end accuracy test: loads the dataset, vectorizes it
  ## with TF-IDF, trains the neural network classifier and (currently
  ## disabled, see the block comment at the end) evaluates it on the
  ## testset.
  # English stopwords stripped from the corpus before vectorization
  const stopwords = @["i", "me", "my", "myself", "we", "our", "ours",
                      "ourselves", "you", "you're", "you've", "you'll", "you'd",
                      "your", "yours", "yourself", "yourselves", "he", "him", "his",
                      "himself", "she", "she's", "her", "hers", "herself", "it", "it's",
                      "its", "itself", "they", "them", "their", "theirs", "themselves", "what",
                      "which", "who", "whom", "this", "that", "that'll", "these", "those", "am",
                      "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
                      "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but",
                      "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
                      "with", "about", "against", "between", "into", "through", "during", "before",
                      "after", "above", "below", "to", "from", "up", "down", "in", "out", "on",
                      "off", "over", "under", "again", "further", "then", "once", "here", "there",
                      "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
                      "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same",
                      "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't",
                      "should", "should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain",
                      "aren", "aren't", "couldn", "couldn't", "didn", "didn't", "doesn",
                      "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven", "haven't",
                      "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn",
                      "needn't", "shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't",
                      "weren", "weren't", "won", "won't", "wouldn", "wouldn't"]
  const epochs = 10     # number of training passes over the data
  const batch = 100     # mini-batch size used during training
  const inputSize = 512 # network input width; TF-IDF rows are cut/padded to this
  let encoder = newLabelEncoder()
  let cleaner = newTextPreprocessor(stopwords=newMatrix(stopwords), toLower=true,
                                    stripPunctuation=true, normalize=false)
  let vectorizer = newTFIDFVectorizer(minDf=0.01, maxDf=0.7, preprocessor=cleaner,
                                      sublinearTf=false, smoothIdf=true)
  # 512 -> 8 -> 16 -> 2 dense network; output is a 2-wide one-hot score
  var classifier = newNeuralNetwork(@[newDenseLayer(inputSize, 8),
                                      newDenseLayer(8, 16),
                                      newDenseLayer(16, 2)],
                                    lossFunc=MSE,
                                    activationFunc=ReLU,
                                    learnRate=0.1,
                                    momentum=0.3,
                                    weightRange=(-1.0, 1.0),
                                    biasRange=(-1.0, 1.0))
  echo "ProjectSydney v0.2b - Accuracy test"
  echo "\nLoading dataset and testset"
  let loadTime = cpuTime()
  let data = loadData()
  echo &"Data loaded in {cpuTime() - loadTime:.2f} seconds"
  # Fix: status message typo "weigths" -> "weights"
  echo "Processing and vectorizing dataset with TF-IDF weights"
  let vectorTime = cpuTime()
  let xTrain = vectorizer.fitTransform(data.corpus, data.corpus)
  let yTrain = encoder.fitTransform(data.results, data.results)[0]
  echo &"Vectorized in {cpuTime() - vectorTime:.2f} seconds"
  echo &"Feature count: {len(vectorizer.getFeatureNames())}"
  echo &"Vocabulary size: {len(vectorizer.getVocabulary())}"
  echo &"Corpus size: {len(data.corpus)}"
  # let yTest = encoder.transform(data.testResults)
  # let xTest = vectorizer.transform(data.testset)
  var tempData: seq[float] = newSeqOfCap[float](inputSize)
  var trainData: seq[tuple[x, y: Matrix[float]]] = @[]
  # Pad (or truncate) each TF-IDF row to exactly inputSize entries and
  # pair it with a one-hot encoded label so it fits into the network
  for i, row in xTrain:
    for e in row:
      if tempData.len() == inputSize:
        break
      tempData.add(e)
    while tempData.len() < inputSize:
      tempData.add(0.0)
    if yTrain[i] == 1:
      trainData.add((newMatrix[float](tempData), newMatrix[float](@[1.0, 0.0])))
    else:
      trainData.add((newMatrix[float](tempData), newMatrix[float](@[0.0, 1.0])))
    tempData.setLen(0) # reuse the buffer for the next row
  echo "Classifier parameters"
  echo &"\tLearn rate: {classifier.learnRate}"
  echo &"\tMomentum: {classifier.momentum}"
  stdout.write("\tNetwork layout: ")
  # Print the layer sizes as "in x out" joined by arrows
  for i, layer in classifier.layers:
    stdout.write(&"{layer.inputSize}x{layer.outputSize}")
    if i < classifier.layers.high():
      stdout.write(" -> ")
  echo ""
  echo &"Training neural network for {epochs} epochs with batch size of {batch}"
  let trainTime = cpuTime()
  classifier.train(epochs, batch, trainData)
  echo &"Training completed in {cpuTime() - trainTime:.2f} seconds"
  # Evaluation against the testset is currently disabled; note it refers
  # to the commented-out xTest/yTest bindings above (and `y_test`, which
  # would need renaming to yTest before re-enabling).
  #[echo "\nTest parameters"
  echo &"\tTest size: {len(data.testset)}"
  let testTime = cpuTime()
  let predictions = classifier.fastFeedForward(xTest)
  let metrics = testMetrics(predictions, y_test)
  echo &"\nTest completed in {cpuTime() - testTime:.2f} seconds, metrics below"
  echo &"\tAccuracy: {metrics.accuracy * 100:.2f}%"
  echo &"\tRecall: {metrics.recall:.2f}"
  echo &"\tPrecision: {metrics.precision:.2f}"
  echo &"\tF1-score: {pow((pow(metrics.precision, -1) + pow(metrics.recall, -1)) / 2, -1):.2f}"
  echo "\tConfusion matrix"
  echo &"\t\tTrue positives: {metrics.tP}"
  echo &"\t\tTrue negatives: {metrics.tN}"
  echo &"\t\tFalse negatives: {metrics.fN}"
  echo &"\t\tFalse positives: {metrics.fP}"
  ]#
|
|
|
|
|
|
|
|
|
|
|
|
when isMainModule:
  # Exit cleanly (status 0) on Ctrl+C instead of dying with an
  # unhandled interrupt; {.noconv.} is required by setControlCHook
  setControlCHook(proc () {.noconv.} = quit(0))
  main()
|
|
|
|
|
2023-03-21 18:56:51 +01:00
|
|
|
|