From 53679de09884a1f556b4ae2bd6ed2a4cac99c4b7 Mon Sep 17 00:00:00 2001
From: Mattia Giambirtone
Date: Tue, 21 Mar 2023 19:45:23 +0100
Subject: [PATCH] Adapted old utilities from Project Sydney

---
 src/nn/network.nim                       |   2 +-
 src/nn/util/feature_extraction/emoji.nim | 124 +++++++++++
 src/nn/util/feature_extraction/text.nim  | 272 +++++++++++++++++++++++
 src/nn/util/preprocessing.nim            |   4 +-
 4 files changed, 399 insertions(+), 3 deletions(-)
 create mode 100644 src/nn/util/feature_extraction/emoji.nim
 create mode 100644 src/nn/util/feature_extraction/text.nim

diff --git a/src/nn/network.nim b/src/nn/network.nim
index c5643aa..f882dd7 100644
--- a/src/nn/network.nim
+++ b/src/nn/network.nim
@@ -1,4 +1,4 @@
-# Copyright 2022 Mattia Giambirtone & All Contributors
+# Copyright 2023 Mattia Giambirtone & All Contributors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/src/nn/util/feature_extraction/emoji.nim b/src/nn/util/feature_extraction/emoji.nim
new file mode 100644
index 0000000..8486d94
--- /dev/null
+++ b/src/nn/util/feature_extraction/emoji.nim
@@ -0,0 +1,124 @@
+# Copyright 2023 Mattia Giambirtone & All Contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+## Emoji parsing/removal utility
+## Code refactored and adapted from https://github.com/thecodedmind/nimoj
+
+import json
+import os
+import httpclient
+import tables
+import strutils
+import re
+import strformat
+
+
+const emojiEndpoint = "https://raw.githubusercontent.com/omnidan/node-emoji/master/lib/emoji.json"
+const emojiPath = "emojis.json"
+
+
+type Emojizer* = ref object
+  ## A wrapper that saves us
+  ## parsing the JSON file
+  ## every time
+  emojiList: JsonNode
+
+
+proc newEmojizer*: Emojizer =
+  ## Returns a new Emojizer object
+  new(result)
+  result.emojiList = newJObject()
+
+
+proc downloadEmojiList =
+  ## Downloads the list of emojis and saves
+  ## it to a JSON file
+  let client = newHttpClient()
+  let list = client.getContent(emojiEndpoint)
+  writeFile(emojiPath, list)
+
+
+proc getEmojiJson(self: Emojizer): JsonNode =
+  ## Returns the JSON object parsed from
+  ## the emoji file
+  if not fileExists(emojiPath):
+    downloadEmojiList()
+  # Only (re)parse the file if the cached object is still empty
+  var temp: string
+  temp.toUgly(self.emojiList)
+  if temp == "{}":
+    self.emojiList = parseFile(emojiPath)
+  result = self.emojiList
+
+
+proc findEmoji*(self: Emojizer, part: string): string =
+  ## Searches the emoji dict for emoji names matching
+  ## the search string and returns the first hit
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if part.replace(re":") in k:
+      return v.getStr()
+
+
+proc findEmojis*(self: Emojizer, part: string): seq[string] =
+  ## Searches the emoji dict for emoji names matching
+  ## the search string and returns all hits
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if part.replace(re":") in k:
+      result.add(v.getStr())
+
+
+proc findEmojiCodes*(self: Emojizer, part: string): seq[string] =
+  ## Similar to findEmojis, but looks for emoji codes instead
+  ## (such as :happy: or :santa:)
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if part.replace(re":") in k:
+      result.add(k)
+
+
+proc getEmoji*(self: Emojizer, emoji: string, default: string = "unknown_emoji"): string =
+  ## Searches the emoji dict for emojis matching the search string
+  return self.getEmojiJson(){emoji.replace(re":")}.getStr(default)
+
+
+proc getEmojiCode*(self: Emojizer, emoji: string): string =
+  ## Returns the emoji code (such as :santa:) matching
+  ## the given emoji character
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if emoji == v.getStr():
+      return k
+
+
+proc emojize*(self: Emojizer, msg: string): string =
+  ## Searches the string for emoji tokens, such as `:santa:`,
+  ## and swaps in the emoji character
+  result = msg
+  var emojis = msg.findAll(re":(.+?):")
+  for emoji in emojis:
+    # An empty default lets us skip unknown tokens instead of
+    # swapping in the "unknown_emoji" placeholder
+    let e = self.getEmoji(emoji, default = "")
+    if e != "":
+      # Replace in the accumulated result, not in the original
+      # message, so that every token gets swapped
+      result = result.replace(emoji, e)
+
+
+proc demojize*(self: Emojizer, msg: string, strip: bool): string =
+  ## Searches the string for emoji characters.
+  ## If strip is false, this function swaps in
+  ## the emoji token, such as :santa:, otherwise
+  ## the emoji is simply removed
+  result = msg
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if v.getStr() in result:
+      if strip:
+        result = result.replace(v.getStr(), "")
+      else:
+        # k already is the emoji code for v, no need to look it up again
+        result = result.replace(v.getStr(), &":{k}:")
+
+
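A minimal usage sketch for the Emojizer API above; the import path and the
input strings are illustrative assumptions, not part of the patch:

    import nn/util/feature_extraction/emoji

    let emojizer = newEmojizer()
    echo emojizer.emojize("Merry Christmas! :santa:")           # -> "Merry Christmas! 🎅"
    echo emojizer.demojize("Merry Christmas! 🎅", strip=false)  # -> "Merry Christmas! :santa:"
    echo emojizer.demojize("Merry Christmas! 🎅", strip=true)   # -> "Merry Christmas! "
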
diff --git a/src/nn/util/feature_extraction/text.nim b/src/nn/util/feature_extraction/text.nim
new file mode 100644
index 0000000..6d71638
--- /dev/null
+++ b/src/nn/util/feature_extraction/text.nim
@@ -0,0 +1,272 @@
+# Copyright 2021 Mattia Giambirtone
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Module to extract features from textual datasets
+import strutils
+import sequtils
+import tables
+import math
+import sets
+import normalize
+
+
+import ../matrix
+import emoji
+
+
+type
+  TextPreprocessor* = ref object of RootObj
+    ## A preprocessor for textual datasets.
+    ## Assumes the input is a sequence of
+    ## sentences with space as a separator
+    corpus: HashSet[string]
+    stopwords: HashSet[string]
+    stripPunctuation: bool
+    toLower: bool
+    normalize: bool
+  TextVectorizer* = ref object of RootObj
+    ## A base type for all text vectorizers; defines
+    ## the common interface
+    corpus: HashSet[string]
+    features: HashSet[string]
+    vocab: Table[string, int]
+    maxDf: float
+    minDf: float
+    sublinearTf: bool
+    preprocessor: TextPreprocessor
+  CountVectorizer* = ref object of TextVectorizer
+    ## Vectorizes a textual dataset using word
+    ## counts as weights
+  TFIDFVectorizer* = ref object of TextVectorizer
+    ## Vectorizes a textual dataset using smoothed
+    ## TF-IDF values as weights
+    smoothIdf: bool
+
+
+proc toHashSet[T](m: Matrix[T]): HashSet[T] =
+  result = initHashSet[T]()
+  for row in m:
+    for element in row:
+      result.incl(element)
+
+
+proc newTextPreprocessor*(stopwords: Matrix[string], stripPunctuation: bool, toLower: bool, normalize: bool): TextPreprocessor =
+  ## Initializes a new TextPreprocessor object
+  new(result)
+  result.stopwords = toHashSet(stopwords)
+  result.stripPunctuation = stripPunctuation
+  result.toLower = toLower
+  result.normalize = normalize
+
+
+proc newTextVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float, sublinearTf: bool): TextVectorizer =
+  ## Initializes a new TextVectorizer object
+  new(result)
+  result.minDf = minDf
+  result.maxDf = maxDf
+  result.sublinearTf = sublinearTf
+  result.preprocessor = preprocessor
+
+
+proc newCountVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float, sublinearTf: bool): CountVectorizer =
+  ## Initializes a new CountVectorizer object
+  new(result)
+  result.minDf = minDf
+  result.maxDf = maxDf
+  result.sublinearTf = sublinearTf
+  result.preprocessor = preprocessor
+
+
+proc newTFIDFVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float, smoothIdf: bool, sublinearTf: bool): TFIDFVectorizer =
+  ## Initializes a new TFIDFVectorizer object
+  new(result)
+  result.minDf = minDf
+  result.maxDf = maxDf
+  result.smoothIdf = smoothIdf
+  result.sublinearTf = sublinearTf
+  result.preprocessor = preprocessor
+
+
+proc fit*(self: TextPreprocessor, corpus: Matrix[string]) =
+  ## Fits the preprocessor to the given corpus
+  self.corpus = toHashSet(corpus)
+
+
+proc transform*(self: TextPreprocessor, x: Matrix[string]): Matrix[string] =
+  ## Transforms the data in the matrix x according to
+  ## the given initialization parameters
+  var res: seq[string] = @[]
+  var stripped: string = ""
+  var emojizer = newEmojizer()
+  for document in x[0]:
+    stripped = document
+    if self.normalize:
+      stripped = emojizer.demojize(stripped, strip=true)
+      stripped = toNFKC(stripped)
+    if self.toLower:
+      stripped = stripped.toLowerAscii()
+    if self.stripPunctuation:
+      stripped = stripped.multiReplace(("'", ""), ("!", ""), ("\"", ""), ("#", ""), ("$", ""),
+                                       ("%", ""), ("&", ""), ("\\", ""), ("(", ""), (")", ""),
+                                       ("*", ""), ("+", ""), (",", ""), ("-", ""), (".", ""),
+                                       ("/", ""), (":", ""), (";", ""), ("<", ""), ("=", ""),
+                                       (">", ""), ("?", ""), ("@", ""), ("[", ""), ("]", ""),
+                                       ("_", ""), ("`", ""), ("{", ""), ("|", ""), ("}", ""),
+                                       ("~", ""))
+    res.add(join(filter(stripped.strip().split(), proc (s: string): bool = s != "" and s notin self.stopwords), " "))
+  result = newMatrix(res)
+
+
+proc transform*(self: TextPreprocessor, x: string): string =
+  ## Transforms the string x according to
+  ## the given initialization parameters
+  var stripped: string = x
+  var emojizer = newEmojizer()
+  if self.normalize and not isNFKC(stripped):
+    stripped = emojizer.demojize(stripped, strip=true)
+    stripped = toNFKC(stripped)
+  if self.toLower:
+    stripped = stripped.toLowerAscii()
+  if self.stripPunctuation:
+    stripped = stripped.multiReplace(("'", ""), ("!", ""), ("\"", ""), ("#", ""), ("$", ""),
+                                     ("%", ""), ("&", ""), ("\\", ""), ("(", ""), (")", ""),
+                                     ("*", ""), ("+", ""), (",", ""), ("-", ""), (".", ""),
+                                     ("/", ""), (":", ""), (";", ""), ("<", ""), ("=", ""),
+                                     (">", ""), ("?", ""), ("@", ""), ("[", ""), ("]", ""),
+                                     ("_", ""), ("`", ""), ("{", ""), ("|", ""), ("}", ""),
+                                     ("~", ""))
+  result.add(join(filter(stripped.strip().split(), proc (s: string): bool = s != "" and s notin self.stopwords), " "))
+
+
+proc fitTransform*(self: TextPreprocessor, corpus, x: Matrix[string]): Matrix[string] =
+  ## Shorthand for fit() and transform()
+  self.fit(corpus)
+  result = self.transform(x)
+
+
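A short usage sketch of the preprocessor on its own; the import paths,
stopword list and sentences below are made-up illustrations:

    import nn/util/matrix
    import nn/util/feature_extraction/text

    let stopwords = newMatrix(@["the", "a", "an"])
    var preprocessor = newTextPreprocessor(stopwords, stripPunctuation=true,
                                           toLower=true, normalize=true)
    let corpus = newMatrix(@["The quick brown fox!", "A lazy dog..."])
    preprocessor.fit(corpus)
    # e.g. yields "quick brown fox" and "lazy dog"
    echo preprocessor.transform(corpus)
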
+proc termFrequency(self: TextVectorizer, term, document: string): float =
+  ## Calculates the frequency of the given term
+  ## in the given document
+  result = document.count(term) / document.len()
+  if self.sublinearTf and result > 0:
+    # Dampen the raw frequency logarithmically (guarding
+    # against ln(0) for terms that do not appear at all)
+    result = 1 + ln(result)
+
+
+proc documentFrequency(self: TextVectorizer, term: string): float =
+  ## Computes the document frequency of the given term in our
+  ## corpus, as a fraction of the total number of documents
+  var x = 0
+  for document in self.corpus:
+    if term in document:
+      x += 1
+  result = x / self.corpus.len()
+
+
+proc inverseDocumentFrequency(self: TFIDFVectorizer, term: string): float =
+  ## Computes the inverse document frequency of the given term in the corpus
+  var n = float(self.corpus.len())
+  # documentFrequency() returns a fraction, so we scale it back up
+  # to a raw document count before applying the IDF formula
+  var df = self.documentFrequency(term) * n
+  if self.smoothIdf:
+    n += 1
+    df += 1
+  # The constant addition makes sure that even words appearing
+  # in all documents are not ignored completely
+  result = ln(n / df) + 1
+
+
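A worked instance of the smoothed formula above, with made-up numbers: for a
corpus of n = 4 documents and a term occurring in 2 of them, the smoothed IDF
is ln((4 + 1) / (2 + 1)) + 1 = ln(5/3) + 1 ≈ 1.51, while a term present in
every document gets ln(5/5) + 1 = 1 rather than 0:

    import math

    let n = 4.0    # documents in the corpus
    let df = 2.0   # documents containing the term
    echo ln((n + 1) / (df + 1)) + 1   # ~1.51
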
+proc fit*(self: TextVectorizer, corpus: Matrix[string]) =
+  ## Fits the vectorizer according to the given corpus
+  self.corpus = toHashSet(corpus)
+  var preprocessor = TextPreprocessor(self.preprocessor)
+  preprocessor.fit(corpus)
+  for document in corpus[0]:
+    for word in preprocessor.transform(document).split():
+      self.features.incl(word)
+  # self.vocab is needed to build the matrix later!
+  # It maps a word to the column where it's supposed to go
+  # in our matrix when transform() is called
+  var index: int
+  var df: float
+  self.vocab = initTable[string, int]()
+  for word in self.features:
+    df = self.documentFrequency(word)
+    if df > self.minDf and df < self.maxDf:
+      # Here we exclude words that occur too
+      # frequently and the ones that occur too
+      # rarely to further reduce potential biases.
+      # This can be seen as a way of "detecting"
+      # stopwords beyond the provided ones
+      self.vocab[word] = index
+      index += 1
+
+
+proc getFeatureNames*(self: TextVectorizer): Matrix[string] =
+  ## Returns the list of analyzed features
+  var res: seq[string] = @[]
+  for feature in self.features:
+    res.add(feature)
+  result = newMatrix(res)
+
+
+proc getVocabulary*(self: TextVectorizer): Matrix[string] =
+  ## Returns the vocabulary of the vectorizer
+  var res: seq[string] = @[]
+  for word in self.vocab.keys():
+    res.add(word)
+  result = newMatrix(res)
+
+
+proc count(self: CountVectorizer, word: string): int =
+  ## Counts the occurrences of a word in our corpus
+  for document in self.corpus:
+    result += document.count(word)
+
+
+proc transform*(self: CountVectorizer, x: Matrix[string]): Matrix[float] =
+  ## Transforms the corpus into a bidimensional matrix
+  ## of shape (len(X), len(self.vocab))
+  # We initialize the matrix with zeros!
+  # This is basically a sparse matrix
+  var res = newSeqOfCap[seq[float]](len(x) * len(self.vocab))
+  var x = TextPreprocessor(self.preprocessor).transform(x)
+  for r in 0..
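A hedged end-to-end sketch of the intended pipeline, reusing the
preprocessor and corpus from the earlier sketch and assuming that
CountVectorizer.transform() fills each row with the weights of the words
kept in the fitted vocabulary (the parameter values are illustrative):

    var vectorizer = newCountVectorizer(preprocessor, minDf=0.01, maxDf=0.95,
                                        sublinearTf=false)
    vectorizer.fit(corpus)
    # One row per document, one column per vocabulary word
    let weights = vectorizer.transform(corpus)
    echo vectorizer.getVocabulary()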