From 53679de09884a1f556b4ae2bd6ed2a4cac99c4b7 Mon Sep 17 00:00:00 2001
From: Mattia Giambirtone
Date: Tue, 21 Mar 2023 19:45:23 +0100
Subject: [PATCH] Adapted old utilities from Project Sydney

---
 src/nn/network.nim                       |   2 +-
 src/nn/util/feature_extraction/emoji.nim | 124 +++++++++++
 src/nn/util/feature_extraction/text.nim  | 272 +++++++++++++++++++++++
 src/nn/util/preprocessing.nim            |   4 +-
 4 files changed, 399 insertions(+), 3 deletions(-)
 create mode 100644 src/nn/util/feature_extraction/emoji.nim
 create mode 100644 src/nn/util/feature_extraction/text.nim

diff --git a/src/nn/network.nim b/src/nn/network.nim
index c5643aa..f882dd7 100644
--- a/src/nn/network.nim
+++ b/src/nn/network.nim
@@ -1,4 +1,4 @@
-# Copyright 2022 Mattia Giambirtone & All Contributors
+# Copyright 2023 Mattia Giambirtone & All Contributors
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/src/nn/util/feature_extraction/emoji.nim b/src/nn/util/feature_extraction/emoji.nim
new file mode 100644
index 0000000..8486d94
--- /dev/null
+++ b/src/nn/util/feature_extraction/emoji.nim
@@ -0,0 +1,124 @@
+# Copyright 2023 Mattia Giambirtone & All Contributors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+## Emoji parsing/removal utility
+## Code refactored and adapted from https://github.com/thecodedmind/nimoj
+
+import json
+import os
+import httpclient
+import tables
+import strutils
+import re
+import strformat
+
+
+const emojiEndpoint = "https://raw.githubusercontent.com/omnidan/node-emoji/master/lib/emoji.json"
+const emojiPath = "emojis.json"
+
+
+type Emojizer* = ref object
+  ## A wrapper that saves us
+  ## parsing the JSON file
+  ## every time
+  emojiList: JsonNode
+
+
+proc newEmojizer*: Emojizer =
+  ## Returns a new Emojizer object
+  new(result)
+  result.emojiList = newJObject()
+
+
+proc downloadEmojiList =
+  ## Downloads the list of emojis and saves
+  ## it to a JSON file
+  let client = newHttpClient()
+  let list = client.getContent(emojiEndpoint)
+  writeFile(emojiPath, list)
+
+
+proc getEmojiJson(self: Emojizer): JsonNode =
+  ## Returns the JSON object parsed from
+  ## the emoji file
+  if not fileExists(emojiPath):
+    downloadEmojiList()
+  # Only (re)parse the file if the cached object is still empty
+  var temp: string
+  temp.toUgly(self.emojiList)
+  if temp == "{}":
+    self.emojiList = parseFile(emojiPath)
+  result = self.emojiList
+
+
+proc findEmoji*(self: Emojizer, part: string): string =
+  ## Searches the emoji dict for emoji names matching
+  ## the search string and returns the first hit
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if part.replace(re":") in k:
+      return v.getStr()
+
+
+proc findEmojis*(self: Emojizer, part: string): seq[string] =
+  ## Searches the emoji dict for emoji names matching
+  ## the search string and returns all hits
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if part.replace(re":") in k:
+      result.add(v.getStr())
+
+
+proc findEmojiCodes*(self: Emojizer, part: string): seq[string] =
+  ## Similar to findEmojis, but looks for emoji codes instead
+  ## (such as :happy: or :santa:)
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if part.replace(re":") in k:
+      result.add(k)
+
+
+proc getEmoji*(self: Emojizer, emoji: string, default: string = "unknown_emoji"): string =
+  ## Searches the emoji dict for emojis matching the search string
+  return self.getEmojiJson(){emoji.replace(re":")}.getStr(default)
+
+
+proc getEmojiCode*(self: Emojizer, emoji: string): string =
+  ## Returns the emoji code (such as :santa:) matching
+  ## the given emoji character
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if emoji == v.getStr():
+      return k
+
+
+proc emojize*(self: Emojizer, msg: string): string =
+  ## Searches the string for emoji tokens, such as `:santa:`,
+  ## and swaps in the emoji character
+  result = msg
+  var emojis = msg.findAll(re":(.+?):")
+  for emoji in emojis:
+    # An empty default lets us skip unknown tokens instead of
+    # swapping in the "unknown_emoji" placeholder
+    let e = self.getEmoji(emoji, default = "")
+    if e != "":
+      # Replace in the accumulated result, not in the original
+      # message, so that every token gets swapped
+      result = result.replace(emoji, e)
+
+
+proc demojize*(self: Emojizer, msg: string, strip: bool): string =
+  ## Searches the string for emoji characters.
+  ## If strip is false, this function swaps in
+  ## the emoji token, such as :santa:, otherwise
+  ## the emoji is simply removed
+  result = msg
+  for k, v in self.getEmojiJson().getFields().pairs:
+    if v.getStr() in result:
+      if strip:
+        result = result.replace(v.getStr(), "")
+      else:
+        # k already is the emoji code for v, no need to look it up again
+        result = result.replace(v.getStr(), &":{k}:")
+
+
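A minimal usage sketch for the Emojizer API above; the import path and the
input strings are illustrative assumptions, not part of the patch:

    import nn/util/feature_extraction/emoji

    let emojizer = newEmojizer()
    echo emojizer.emojize("Merry Christmas! :santa:")           # -> "Merry Christmas! 🎅"
    echo emojizer.demojize("Merry Christmas! 🎅", strip=false)  # -> "Merry Christmas! :santa:"
    echo emojizer.demojize("Merry Christmas! 🎅", strip=true)   # -> "Merry Christmas! "
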
diff --git a/src/nn/util/feature_extraction/text.nim b/src/nn/util/feature_extraction/text.nim
new file mode 100644
index 0000000..6d71638
--- /dev/null
+++ b/src/nn/util/feature_extraction/text.nim
@@ -0,0 +1,272 @@
+# Copyright 2021 Mattia Giambirtone
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Module to extract features from textual datasets
+import strutils
+import sequtils
+import tables
+import math
+import sets
+import normalize
+
+
+import ../matrix
+import emoji
+
+
+type
+  TextPreprocessor* = ref object of RootObj
+    ## A preprocessor for textual datasets.
+    ## Assumes the input is a sequence of
+    ## sentences with space as a separator
+    corpus: HashSet[string]
+    stopwords: HashSet[string]
+    stripPunctuation: bool
+    toLower: bool
+    normalize: bool
+  TextVectorizer* = ref object of RootObj
+    ## A base type for all text vectorizers; defines
+    ## the common interface
+    corpus: HashSet[string]
+    features: HashSet[string]
+    vocab: Table[string, int]
+    maxDf: float
+    minDf: float
+    sublinearTf: bool
+    preprocessor: TextPreprocessor
+  CountVectorizer* = ref object of TextVectorizer
+    ## Vectorizes a textual dataset using word
+    ## counts as weights
+  TFIDFVectorizer* = ref object of TextVectorizer
+    ## Vectorizes a textual dataset using smoothed
+    ## TF-IDF values as weights
+    smoothIdf: bool
+
+
+proc toHashSet[T](m: Matrix[T]): HashSet[T] =
+  result = initHashSet[T]()
+  for row in m:
+    for element in row:
+      result.incl(element)
+
+
+proc newTextPreprocessor*(stopwords: Matrix[string], stripPunctuation: bool, toLower: bool, normalize: bool): TextPreprocessor =
+  ## Initializes a new TextPreprocessor object
+  new(result)
+  result.stopwords = toHashSet(stopwords)
+  result.stripPunctuation = stripPunctuation
+  result.toLower = toLower
+  result.normalize = normalize
+
+
+proc newTextVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float, sublinearTf: bool): TextVectorizer =
+  ## Initializes a new TextVectorizer object
+  new(result)
+  result.minDf = minDf
+  result.maxDf = maxDf
+  result.sublinearTf = sublinearTf
+  result.preprocessor = preprocessor
+
+
+proc newCountVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float, sublinearTf: bool): CountVectorizer =
+  ## Initializes a new CountVectorizer object
+  new(result)
+  result.minDf = minDf
+  result.maxDf = maxDf
+  result.sublinearTf = sublinearTf
+  result.preprocessor = preprocessor
+
+
+proc newTFIDFVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float, smoothIdf: bool, sublinearTf: bool): TFIDFVectorizer =
+  ## Initializes a new TFIDFVectorizer object
+  new(result)
+  result.minDf = minDf
+  result.maxDf = maxDf
+  result.smoothIdf = smoothIdf
+  result.sublinearTf = sublinearTf
+  result.preprocessor = preprocessor
+
+
+proc fit*(self: TextPreprocessor, corpus: Matrix[string]) =
+  ## Fits the preprocessor to the given corpus
+  self.corpus = toHashSet(corpus)
+
+
+proc transform*(self: TextPreprocessor, x: Matrix[string]): Matrix[string] =
+  ## Transforms the data in the matrix x according to
+  ## the given initialization parameters
+  var res: seq[string] = @[]
+  var stripped: string = ""
+  var emojizer = newEmojizer()
+  for document in x[0]:
+    stripped = document
+    if self.normalize:
+      stripped = emojizer.demojize(stripped, strip=true)
+      stripped = toNFKC(stripped)
+    if self.toLower:
+      stripped = stripped.toLowerAscii()
+    if self.stripPunctuation:
+      stripped = stripped.multiReplace(("'", ""), ("!", ""), ("\"", ""), ("#", ""), ("$", ""),
+                                       ("%", ""), ("&", ""), ("\\", ""), ("(", ""), (")", ""),
+                                       ("*", ""), ("+", ""), (",", ""), ("-", ""), (".", ""),
+                                       ("/", ""), (":", ""), (";", ""), ("<", ""), ("=", ""),
+                                       (">", ""), ("?", ""), ("@", ""), ("[", ""), ("]", ""),
+                                       ("_", ""), ("`", ""), ("{", ""), ("|", ""), ("}", ""),
+                                       ("~", ""))
+    res.add(join(filter(stripped.strip().split(), proc (s: string): bool = s != "" and s notin self.stopwords), " "))
+  result = newMatrix(res)
+
+
+proc transform*(self: TextPreprocessor, x: string): string =
+  ## Transforms the string x according to
+  ## the given initialization parameters
+  var stripped: string = x
+  var emojizer = newEmojizer()
+  if self.normalize and not isNFKC(stripped):
+    stripped = emojizer.demojize(stripped, strip=true)
+    stripped = toNFKC(stripped)
+  if self.toLower:
+    stripped = stripped.toLowerAscii()
+  if self.stripPunctuation:
+    stripped = stripped.multiReplace(("'", ""), ("!", ""), ("\"", ""), ("#", ""), ("$", ""),
+                                     ("%", ""), ("&", ""), ("\\", ""), ("(", ""), (")", ""),
+                                     ("*", ""), ("+", ""), (",", ""), ("-", ""), (".", ""),
+                                     ("/", ""), (":", ""), (";", ""), ("<", ""), ("=", ""),
+                                     (">", ""), ("?", ""), ("@", ""), ("[", ""), ("]", ""),
+                                     ("_", ""), ("`", ""), ("{", ""), ("|", ""), ("}", ""),
+                                     ("~", ""))
+  result.add(join(filter(stripped.strip().split(), proc (s: string): bool = s != "" and s notin self.stopwords), " "))
+
+
+proc fitTransform*(self: TextPreprocessor, corpus, x: Matrix[string]): Matrix[string] =
+  ## Shorthand for fit() and transform()
+  self.fit(corpus)
+  result = self.transform(x)
+
+
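A short usage sketch of the preprocessor on its own; the import paths,
stopword list and sentences below are made-up illustrations:

    import nn/util/matrix
    import nn/util/feature_extraction/text

    let stopwords = newMatrix(@["the", "a", "an"])
    var preprocessor = newTextPreprocessor(stopwords, stripPunctuation=true,
                                           toLower=true, normalize=true)
    let corpus = newMatrix(@["The quick brown fox!", "A lazy dog..."])
    preprocessor.fit(corpus)
    # e.g. yields "quick brown fox" and "lazy dog"
    echo preprocessor.transform(corpus)
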
+proc termFrequency(self: TextVectorizer, term, document: string): float =
+  ## Calculates the frequency of the given term
+  ## in the given document
+  result = document.count(term) / document.len()
+  if self.sublinearTf and result > 0:
+    # Dampen the raw frequency logarithmically (guarding
+    # against ln(0) for terms that do not appear at all)
+    result = 1 + ln(result)
+
+
+proc documentFrequency(self: TextVectorizer, term: string): float =
+  ## Computes the document frequency of the given term in our
+  ## corpus, as a fraction of the total number of documents
+  var x = 0
+  for document in self.corpus:
+    if term in document:
+      x += 1
+  result = x / self.corpus.len()
+
+
+proc inverseDocumentFrequency(self: TFIDFVectorizer, term: string): float =
+  ## Computes the inverse document frequency of the given term in the corpus
+  var n = float(self.corpus.len())
+  # documentFrequency() returns a fraction, so we scale it back up
+  # to a raw document count before applying the IDF formula
+  var df = self.documentFrequency(term) * n
+  if self.smoothIdf:
+    n += 1
+    df += 1
+  # The constant addition makes sure that even words appearing
+  # in all documents are not ignored completely
+  result = ln(n / df) + 1
+
+
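A worked instance of the smoothed formula above, with made-up numbers: for a
corpus of n = 4 documents and a term occurring in 2 of them, the smoothed IDF
is ln((4 + 1) / (2 + 1)) + 1 = ln(5/3) + 1 ≈ 1.51, while a term present in
every document gets ln(5/5) + 1 = 1 rather than 0:

    import math

    let n = 4.0    # documents in the corpus
    let df = 2.0   # documents containing the term
    echo ln((n + 1) / (df + 1)) + 1   # ~1.51
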
+proc fit*(self: TextVectorizer, corpus: Matrix[string]) =
+  ## Fits the vectorizer according to the given corpus
+  self.corpus = toHashSet(corpus)
+  var preprocessor = TextPreprocessor(self.preprocessor)
+  preprocessor.fit(corpus)
+  for document in corpus[0]:
+    for word in preprocessor.transform(document).split():
+      self.features.incl(word)
+  # self.vocab is needed to build the matrix later!
+  # It maps a word to the column where it's supposed to go
+  # in our matrix when transform() is called
+  var index: int
+  var df: float
+  self.vocab = initTable[string, int]()
+  for word in self.features:
+    df = self.documentFrequency(word)
+    if df > self.minDf and df < self.maxDf:
+      # Here we exclude words that occur too
+      # frequently and the ones that occur too
+      # rarely to further reduce potential biases.
+      # This can be seen as a way of "detecting"
+      # stopwords beyond the provided ones
+      self.vocab[word] = index
+      index += 1
+
+
+proc getFeatureNames*(self: TextVectorizer): Matrix[string] =
+  ## Returns the list of analyzed features
+  var res: seq[string] = @[]
+  for feature in self.features:
+    res.add(feature)
+  result = newMatrix(res)
+
+
+proc getVocabulary*(self: TextVectorizer): Matrix[string] =
+  ## Returns the vocabulary of the vectorizer
+  var res: seq[string] = @[]
+  for word in self.vocab.keys():
+    res.add(word)
+  result = newMatrix(res)
+
+
+proc count(self: CountVectorizer, word: string): int =
+  ## Counts the occurrences of a word in our corpus
+  for document in self.corpus:
+    result += document.count(word)
+
+
+proc transform*(self: CountVectorizer, x: Matrix[string]): Matrix[float] =
+  ## Transforms the corpus into a bidimensional matrix
+  ## of shape (len(X), len(self.vocab))
+  # We initialize the matrix with zeros!
+  # This is basically a sparse matrix
+  var res = newSeqOfCap[seq[float]](len(x) * len(self.vocab))
+  var x = TextPreprocessor(self.preprocessor).transform(x)
+  for r in 0..
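A hedged end-to-end sketch of the intended pipeline, reusing the
preprocessor and corpus from the earlier sketch and assuming that
CountVectorizer.transform() fills each row with the weights of the words
kept in the fitted vocabulary (the parameter values are illustrative):

    var vectorizer = newCountVectorizer(preprocessor, minDf=0.01, maxDf=0.95,
                                        sublinearTf=false)
    vectorizer.fit(corpus)
    # One row per document, one column per vocabulary word
    let weights = vectorizer.transform(corpus)
    echo vectorizer.getVocabulary()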