# Copyright 2021 Mattia Giambirtone
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Module to extract features from textual datasets

import strutils
import sequtils
import tables
import math
import sets
import normalize
import ../matrix
import emoji


type
  TextPreprocessor* = ref object of RootObj
    ## A preprocessor for textual datasets.
    ## Assumes the input is a sequence of
    ## sentences with space as a separator
    corpus: HashSet[string]     # documents passed to fit()
    stopwords: HashSet[string]  # tokens dropped by transform()
    stripPunctuation: bool      # remove ASCII punctuation when true
    toLower: bool               # ASCII-lowercase the text when true
    normalize: bool             # strip emoji and apply NFKC when true
  TextVectorizer* = ref object of RootObj
    ## A base type for all text vectorizers, defines
    ## the common interface
    corpus: HashSet[string]     # raw documents passed to fit()
    features: HashSet[string]   # every token seen while fitting
    vocab: Table[string, int]   # retained token -> matrix column
    maxDf: float                # exclusive upper bound on document frequency
    minDf: float                # exclusive lower bound on document frequency
    sublinearTf: bool           # replace tf with 1 + ln(tf) when true
    preprocessor: TextPreprocessor
  CountVectorizer* = ref object of TextVectorizer
    ## Vectorizes a textual dataset using word
    ## counts as weights
  TFIDFVectorizer* = ref object of TextVectorizer
    ## Vectorizes a textual dataset using smoothed
    ## TF-IDF values as weights
    smoothIdf: bool             # add one to both IDF counts when true


# Replacement pairs handed to multiReplace(): every listed punctuation
# character is replaced with the empty string.
# NOTE(review): '^' is not part of this list -- confirm whether that is
# intentional before adding it.
const punctuationReplacements = [
  ("'", ""), ("!", ""), ("\"", ""), ("#", ""), ("$", ""), ("%", ""),
  ("&", ""), ("\\", ""), ("(", ""), (")", ""), ("*", ""), ("+", ""),
  (",", ""), ("-", ""), (".", ""), ("/", ""), (":", ""), (";", ""),
  ("<", ""), ("=", ""), (">", ""), ("?", ""), ("@", ""), ("[", ""),
  ("]", ""), ("_", ""), ("`", ""), ("{", ""), ("|", ""), ("}", ""),
  ("~", "")
]


proc toHashSet[T](m: Matrix[T]): HashSet[T] =
  ## Collects every element of every row of `m` into a hash set
  result = initHashSet[T]()
  for row in m:
    for element in row:
      result.incl(element)


proc newTextPreprocessor*(stopwords: Matrix[string], stripPunctuation: bool,
                          toLower: bool, normalize: bool): TextPreprocessor =
  ## Initializes a new TextPreprocessor object
  new(result)
  result.stopwords = toHashSet(stopwords)
  result.stripPunctuation = stripPunctuation
  result.toLower = toLower
  result.normalize = normalize


proc initCommon(self: TextVectorizer, preprocessor: TextPreprocessor,
                minDf, maxDf: float, sublinearTf: bool) =
  ## Sets the fields shared by every vectorizer type
  self.minDf = minDf
  self.maxDf = maxDf
  self.sublinearTf = sublinearTf
  self.preprocessor = preprocessor


proc newTextVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float,
                        sublinearTf: bool): TextVectorizer =
  ## Initializes a new TextVectorizer object
  new(result)
  result.initCommon(preprocessor, minDf, maxDf, sublinearTf)


proc newCountVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float,
                         sublinearTf: bool): CountVectorizer =
  ## Initializes a new CountVectorizer object
  new(result)
  result.initCommon(preprocessor, minDf, maxDf, sublinearTf)


proc newTFIDFVectorizer*(preprocessor: TextPreprocessor, minDf, maxDf: float,
                         smoothIdf: bool,
                         sublinearTf: bool): TFIDFVectorizer =
  ## Initializes a new TFIDFVectorizer object
  new(result)
  result.initCommon(preprocessor, minDf, maxDf, sublinearTf)
  result.smoothIdf = smoothIdf


proc fit*(self: TextPreprocessor, corpus: Matrix[string]) =
  ## Fits the preprocessor to the given corpus
  self.corpus = toHashSet(corpus)


proc finishCleaning(self: TextPreprocessor, text: string): string =
  ## Applies the cleaning steps shared by both transform() overloads:
  ## optional ASCII lowercasing, optional punctuation removal, then
  ## whitespace tokenisation with stopwords dropped. The surviving
  ## tokens are re-joined with a single space
  result = text
  if self.toLower:
    result = result.toLowerAscii()
  if self.stripPunctuation:
    result = result.multiReplace(punctuationReplacements)
  # splitWhitespace() never yields empty tokens, so runs of
  # whitespace collapse without an explicit emptiness check
  result = result.splitWhitespace().filterIt(
    it notin self.stopwords).join(" ")


proc transform*(self: TextPreprocessor, x: Matrix[string]): Matrix[string] =
  ## Transforms the data in the vector X according to
  ## the given initialization parameters. Only the first
  ## row of X (x[0]) is read; each of its elements is
  ## treated as one document
  var res: seq[string] = @[]
  # A single emojizer is shared by all the documents
  var emojizer = newEmojizer()
  for document in x[0]:
    var cleaned = document
    if self.normalize:
      cleaned = emojizer.demojize(cleaned, strip=true)
      cleaned = toNFKC(cleaned)
    res.add(self.finishCleaning(cleaned))
  result = newMatrix(res)


proc transform*(self: TextPreprocessor, x: string): string =
  ## Transforms the string X according to
  ## the given initialization parameters
  var cleaned = x
  if self.normalize:
    # Emoji are stripped even when the text is already in NFKC form,
    # so that this overload produces the same output as the Matrix
    # one for the same document
    var emojizer = newEmojizer()
    cleaned = emojizer.demojize(cleaned, strip=true)
    cleaned = toNFKC(cleaned)
  result = self.finishCleaning(cleaned)


proc fitTransform*(self: TextPreprocessor, corpus,
                   x: Matrix[string]): Matrix[string] =
  ## Shorthand for fit() and transform()
  self.fit(corpus)
  result = self.transform(x)


proc termFrequency(self: TextVectorizer, term, document: string): float =
  ## Calculates the frequency of a given term inside a document:
  ## the number of non-overlapping occurrences of the term divided
  ## by the length of the document. When sublinearTf is set the
  ## value is replaced with 1 + ln(value).
  ## Returns 0 when the term or the document is empty, or when
  ## the term does not occur in the document
  if term.len == 0 or document.len == 0:
    # Avoids a 0/0 division and counting an empty substring
    return 0.0
  let occurrences = document.count(term)
  if occurrences == 0:
    # Without this guard sublinear scaling would compute ln(0)
    return 0.0
  result = occurrences / document.len()
  if self.sublinearTf:
    result = 1 + ln(result)


proc documentCount(self: TextVectorizer, term: string): int =
  ## Counts the documents of our corpus that contain the
  ## given term as a substring
  for document in self.corpus:
    if term in document:
      inc result


proc documentFrequency(self: TextVectorizer, term: string): float =
  ## Computes the document frequency of a given term in our corpus,
  ## i.e. the fraction of documents containing it. Returns 0 when
  ## the corpus is empty
  let total = self.corpus.len()
  if total == 0:
    return 0.0
  result = self.documentCount(term) / total


proc inverseDocumentFrequency(self: TFIDFVectorizer, term: string): float =
  ## Computes the inverse document frequency of a given term in the
  ## corpus as ln(n / df) + 1, where n is the number of documents and
  ## df the number of documents containing the term. With smoothIdf
  ## both n and df are incremented by one.
  ## Returns 0 when, without smoothing, no document contains the term
  var n = float(self.corpus.len())
  # Both operands are document *counts*, so that smoothing adds one
  # document to each side of the ratio
  var df = float(self.documentCount(term))
  if self.smoothIdf:
    n += 1
    df += 1
  if df == 0:
    # The ratio is undefined: give the term no weight instead of Inf/NaN
    return 0.0
  result = ln(n / df) + 1 # This constant addition makes sure that
  # even words appearing in all documents are not ignored completely


proc fit*(self: TextVectorizer, corpus: Matrix[string]) =
  ## Fits the vectorizer according to the given corpus: stores the
  ## corpus, fits the preprocessor, collects the preprocessed tokens
  ## of the first row of the corpus as features and assigns a column
  ## index to every feature whose document frequency lies strictly
  ## between minDf and maxDf
  self.corpus = toHashSet(corpus)
  let preprocessor = self.preprocessor
  preprocessor.fit(corpus)
  # Start from scratch so that fitting twice does not keep
  # features that belong to a previous corpus
  self.features = initHashSet[string]()
  for document in corpus[0]:
    # splitWhitespace() yields nothing for an empty document, so the
    # empty string never becomes a feature
    for word in preprocessor.transform(document).splitWhitespace():
      self.features.incl(word)
  # self.vocab is needed to build the matrix later!
  # It maps a word to the column where it's supposed to go
  # in our matrix when transform() is called
  self.vocab = initTable[string, int]()
  var index = 0
  for word in self.features:
    # NOTE(review): the document frequency is measured against the raw
    # documents in self.corpus while `word` has been preprocessed --
    # confirm this is intended (e.g. lowercased words vs. raw text)
    let df = self.documentFrequency(word)
    if df > self.minDf and df < self.maxDf:
      # Here we exclude words that occur too
      # frequently and the ones that occur too
      # rarely to further reduce potential biases.
      # This can be seen as a way of "detecting" stopwords
      # beyond the provided ones
      self.vocab[word] = index
      inc index


proc getFeatureNames*(self: TextVectorizer): Matrix[string] =
  ## Returns the list of analyzed features
  result = newMatrix(toSeq(self.features.items))


proc getVocabulary*(self: TextVectorizer): Matrix[string] =
  ## Returns the vocabulary of the vectorizer
  result = newMatrix(toSeq(self.vocab.keys))


proc count(self: CountVectorizer, word: string): int =
  ## Counts the occurrences of a word in our corpus.
  ## Returns 0 for an empty word
  if word.len == 0:
    return 0
  for document in self.corpus:
    result += document.count(word)


proc transform*(self: CountVectorizer, x: Matrix[string]): Matrix[float] =
  ## Transforms the corpus into a bidimensional matrix
  ## of shape (len(X), len(self.vocab))

  # We initialize the matrix with zeros!
  # This is basically a sparse matrix
  var res = newSeqOfCap[seq[float]](len(x) * len(self.vocab))
  var x = TextPreprocessor(self.preprocessor).transform(x)
  for r in 0..