Added DLTC
This commit is contained in:
commit
7524c38142
|
@ -0,0 +1,11 @@
|
|||
# These targets name commands, not files. Without .PHONY, a file or directory
# called `build`, `clean` or `install` in this directory would make the
# matching target look up to date and its recipe would be skipped.
.PHONY: clean build install

# Remove the artefacts left behind by previous builds.
clean:
	rm -rf dltc/build
	rm -rf dltc/dist
	rm -rf dltc/coffeehouse_dltc.egg-info

# Build the package and create a source distribution.
build:
	python3 dltc/setup.py build
	python3 dltc/setup.py sdist

# Install the package using the python3 interpreter on PATH.
install:
	python3 dltc/setup.py install
|
|
@ -0,0 +1,130 @@
|
|||
# CoffeeHouse DLTC
|
||||
|
||||
CoffeeHouse Deep Learning Classification Engine is a method for creating K2 Models on large data
|
||||
to predict labels from them. For example, you can train the model on a bunch of "Sports" articles
|
||||
and "Political" articles (with the appropriate labels assigned to each article) and train the
|
||||
model. You can give the model a new article that's either "Sports" or "Political" related and
|
||||
the model will be able to predict the likelihood of the article being Political or Sports related.
|
||||
|
||||
This was forked from [magpie](https://github.com/inspirehep/magpie) but rewritten to handle data
|
||||
and the training process more quickly and efficiently than the original project.
|
||||
|
||||
# Installation
|
||||
|
||||
```shell script
|
||||
python3 setup.py install
|
||||
```
|
||||
|
||||
# Usage
|
||||
|
||||
Create a directory for your model, your directory must contain a model.json file
|
||||
formatted like this
|
||||
|
||||
```json
|
||||
{
|
||||
"model": {
|
||||
"name": "Spam Ham",
|
||||
"model_name": "spam_ham",
|
||||
"author": "Zi Xing",
|
||||
"version": "1.0.0.0",
|
||||
"description": "Model for predicting messages which contains spam or ham"
|
||||
},
|
||||
"training_properties":{
|
||||
"epoch": 35,
|
||||
"vec_dim": 100,
|
||||
"test_ratio": 0.2,
|
||||
"architecture": "cnn",
|
||||
"batch_size": 64
|
||||
},
|
||||
"classification": [
|
||||
{"l": "spam", "f": "spam.dat"},
|
||||
{"l": "ham", "f": "ham.dat"}
|
||||
]
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
### Model
|
||||
|
||||
| Property Name | Description |
|
||||
|---------------|------------------------------------------------------------|
|
||||
| name | The name of the model |
|
||||
| model_name | The safe name of the model which is used for IO operations |
|
||||
| author | The author which constructed the data for the model |
|
||||
| version | The version of the model |
|
||||
| description | The description of the model, what it does, etc. |
|
||||
|
||||
|
||||
### Training Properties
|
||||
|
||||
| Property Name | Description |
|
||||
|---------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| epoch | The amount of training sessions the model must run through |
|
||||
| vec_dim | The dimensionality of the word vectors that are built |
|
||||
| test_ratio | Splits the data into train & test datasets and evaluates the model after every epoch, displaying its current loss and accuracy. The default value of `test_ratio` is 0, meaning that all the data will be used for training. |
|
||||
| architecture | The type of model to train on, the possible values are `cnn` and `rnn` |
|
||||
| batch_size | The size of the batch for training purposes |
|
||||
|
||||
### Classification
|
||||
|
||||
| Property Name | Description |
|
||||
|---------------|-----------------------------------------------------------------------------|
|
||||
| l | The label for the data, eg; `spam`, `ham`... |
|
||||
| f | The name of the .dat file which consists of the data split into line breaks |
|
||||
|
||||
|
||||
## Training the model
|
||||
|
||||
To train the model, the model must be clustered into a structured directory which will create a
|
||||
bunch of files for the data and labels which would be easier to manage and train the data from
|
||||
those files. Afterwards, the temporary directory will be deleted.
|
||||
|
||||
```python
|
||||
from coffeehouse_dltc.chmodel.configuration import Configuration
|
||||
|
||||
# Model directory must contain model.json and the required .dat files
|
||||
configuration = Configuration('<Model Directory>')
|
||||
configuration.train_model()
|
||||
```
|
||||
|
||||
Once this process is done, an output directory will be created with all the generated models
|
||||
|
||||
| File Extension | Description |
|
||||
|----------------|----------------------------------------------|
|
||||
| `.che` | This file contains the word vectors |
|
||||
| `.chs` | File format responsible for the scaler data |
|
||||
| `.chm` | Main classification model |
|
||||
| `.chl` | JSON File format which contains the labels |
|
||||
|
||||
All these files are important in order for the model data to be loaded correctly into memory
|
||||
|
||||
|
||||
## Classifying data
|
||||
|
||||
Assuming the model files have been created, you can load the model cluster and
|
||||
predict from text or file input
|
||||
|
||||
```python
|
||||
from coffeehouse_dltc.main import DLTC
|
||||
|
||||
dltc = DLTC()
|
||||
dltc.load_model_cluster('<Model Directory Output>')
|
||||
|
||||
dltc.predict_from_text("Hello World")
|
||||
# {'ham': 0.9650128, 'spam': 0.040875915}
|
||||
|
||||
|
||||
dltc.predict_from_file("text.txt")
|
||||
# {'spam': 0.61647576, 'ham': 0.42338383}
|
||||
```
|
||||
|
||||
|
||||
## From the CLI
|
||||
|
||||
You can access CoffeeHouse-DLTC's features from the command-line interface.
|
||||
|
||||
```shell script
|
||||
python3 -m coffeehouse_dltc --model-info <source directory>
|
||||
python3 -m coffeehouse_dltc --train-model <source directory>
|
||||
python3 -m coffeehouse_dltc --test-model <built model directory>
|
||||
```
|
|
@ -0,0 +1,19 @@
|
|||
from . import main
|
||||
from .main import *
|
||||
|
||||
from . import config
|
||||
from .config import *
|
||||
|
||||
from . import utils
|
||||
from .utils import *
|
||||
|
||||
from . import base
|
||||
from .base import *
|
||||
|
||||
from . import chmodel
|
||||
from .chmodel import *
|
||||
|
||||
from . import nn
|
||||
from .nn import *
|
||||
|
||||
__all__ = ['main', 'base', 'chmodel', 'nn', 'DLTC']
|
|
@ -0,0 +1,130 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import unicode_literals
|
||||
from coffeehouse_dltc.chmodel.configuration import Configuration
|
||||
from coffeehouse_dltc.main import DLTC
|
||||
import sys
|
||||
import os
|
||||
|
||||
|
||||
def _real_main(argv=None):
|
||||
"""
|
||||
The main command-line processor
|
||||
|
||||
:param argv:
|
||||
:return:
|
||||
"""
|
||||
if argv[1] == '--help':
|
||||
_help_menu(argv)
|
||||
if argv[1] == '--model-info':
|
||||
_model_info(argv)
|
||||
if argv[1] == '--train-model':
|
||||
_train_model(argv)
|
||||
if argv[1] == '--test-model':
|
||||
_test_model(argv)
|
||||
|
||||
|
||||
def _help_menu(argv=None):
|
||||
"""
|
||||
Displays the help menu and commandline usage
|
||||
|
||||
:param argv:
|
||||
:return:
|
||||
"""
|
||||
print(
|
||||
"CoffeeHouse DLTC CLI\n\n"
|
||||
" --model-info <directory_structure_input>\n"
|
||||
" --train-model <directory_structure_input>\n"
|
||||
" --test-model <model_directory>\n"
|
||||
)
|
||||
sys.exit()
|
||||
|
||||
|
||||
def _test_model(argv=None):
|
||||
"""
|
||||
Tests the model's prediction by allowing user input and displaying the
|
||||
prediction output
|
||||
|
||||
:param argv:
|
||||
:return:
|
||||
"""
|
||||
directory_model_input = os.path.join(os.getcwd(), argv[2])
|
||||
|
||||
if not os.path.exists(directory_model_input):
|
||||
print("\nERROR: The directory '{0}' does not exist".format(directory_model_input))
|
||||
sys.exit()
|
||||
|
||||
print("Loading model")
|
||||
dltc = DLTC()
|
||||
dltc.load_model_cluster(directory_model_input)
|
||||
print("Ready\n")
|
||||
|
||||
while True:
|
||||
input_text = input("> ")
|
||||
print(dltc.predict_from_text(input_text))
|
||||
|
||||
|
||||
def _train_model(argv=None):
|
||||
"""
|
||||
Trains the model from the source directory
|
||||
|
||||
:param argv:
|
||||
:return:
|
||||
"""
|
||||
directory_structure_input = os.path.join(os.getcwd(), argv[2])
|
||||
|
||||
if not os.path.exists(directory_structure_input):
|
||||
print("\nERROR: The directory '{0}' does not exist".format(directory_structure_input))
|
||||
sys.exit()
|
||||
|
||||
configuration = Configuration(directory_structure_input)
|
||||
_model_info(argv)
|
||||
|
||||
print("\n\n----- Model Training Started -----\n")
|
||||
configuration.train_model()
|
||||
|
||||
|
||||
def _model_info(argv=None):
|
||||
"""
|
||||
Displays information about the model and the training configurations
|
||||
|
||||
:param argv:
|
||||
:return:
|
||||
"""
|
||||
directory_structure_input = os.path.join(os.getcwd(), argv[2])
|
||||
|
||||
if not os.path.exists(directory_structure_input):
|
||||
print("\nERROR: The directory '{0}' does not exist".format(directory_structure_input))
|
||||
sys.exit()
|
||||
|
||||
configuration = Configuration(directory_structure_input)
|
||||
print(
|
||||
"\n--- Model Configuration Information ---\n\n"
|
||||
" Name : {0}\n"
|
||||
" Author : {1}\n"
|
||||
" Version : {2}\n"
|
||||
" Description : {3}\n"
|
||||
"---------------------------------------\n"
|
||||
" EPOCH : {4}\n"
|
||||
" VEC_DIM : {5}\n"
|
||||
" TEST_RATIO : {6}\n"
|
||||
" ARCHITECTURE : {7}\n"
|
||||
" BATCH_SIZE : {8}\n"
|
||||
"\n".format(
|
||||
configuration.__name__,
|
||||
configuration.__author__,
|
||||
configuration.__version__,
|
||||
configuration.__description__,
|
||||
configuration.configuration['training_properties']['epoch'],
|
||||
configuration.configuration['training_properties']['vec_dim'],
|
||||
configuration.configuration['training_properties']['test_ratio'],
|
||||
configuration.configuration['training_properties']['architecture'],
|
||||
configuration.configuration['training_properties']['batch_size']
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# Entry point: only run the command-line processor when this module is executed
if __name__ == '__main__':
    try:
        _real_main(sys.argv)
    except KeyboardInterrupt:
        # Ctrl-C: print a short notice instead of a KeyboardInterrupt traceback
        print('\nERROR: Interrupted by user')
|
|
@ -0,0 +1,53 @@
|
|||
from __future__ import print_function, unicode_literals
|
||||
|
||||
import io
|
||||
import os
|
||||
import nltk
|
||||
import string
|
||||
|
||||
from nltk.tokenize import WordPunctTokenizer, sent_tokenize, word_tokenize
|
||||
|
||||
nltk.download('punkt', quiet=True) # make sure it's downloaded before using
|
||||
|
||||
|
||||
class Document(object):
    """ Class representing a document that the keywords are extracted from """

    def __init__(self, doc_id, filepath, text=None):
        """
        Build a document either from an in-memory string or from a file.

        :param doc_id: identifier stored on the document as doc_id
        :param filepath: path of a UTF-8 text file; only read when text is None
        :param text: content of the document; when given, no file is read and
         filename / filepath are set to None

        :raises ValueError: if text is None and filepath is None or missing
        """
        self.doc_id = doc_id

        # Compare against None: an empty string is still valid in-memory text.
        # A plain truthiness test sent "" down the file branch, where
        # os.path.exists(None) raised TypeError.
        if text is not None:
            self.text = text
            self.filename = None
            self.filepath = None
        else:  # is a path to a file
            if filepath is None or not os.path.exists(filepath):
                raise ValueError("The file {0} doesn't exist".format(filepath))

            self.filepath = filepath
            self.filename = os.path.basename(filepath)

            with io.open(filepath, 'r', encoding='utf-8') as f:
                self.text = f.read()

        self.wordset = self.compute_wordset()

    def __str__(self):
        return self.text

    def compute_wordset(self):
        """ Return the set of lowercased tokens without common punctuation """
        tokens = WordPunctTokenizer().tokenize(self.text)
        lowercase = {t.lower() for t in tokens}
        return lowercase - {',', '.', '!', ';', ':', '-', '', None}

    def get_all_words(self):
        """ Return all words tokenized, in lowercase and without punctuation """
        return [w.lower() for w in word_tokenize(self.text)
                if w not in string.punctuation]

    def read_sentences(self):
        """ Return one list of lowercased, punctuation-free words per sentence """
        # Sentences are detected line by line, so they never span a line break
        raw = [sentence for line in self.text.split('\n')
               for sentence in sent_tokenize(line)]
        return [[w.lower() for w in word_tokenize(s) if w not in string.punctuation]
                for s in raw]
|
|
@ -0,0 +1,127 @@
|
|||
from __future__ import print_function, unicode_literals
|
||||
import os
|
||||
import six
|
||||
import numpy as np
|
||||
from functools import reduce
|
||||
|
||||
from gensim.models import Word2Vec
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
|
||||
from coffeehouse_dltc.base.document import Document
|
||||
from coffeehouse_dltc.config import EMBEDDING_SIZE, WORD2VEC_WORKERS, MIN_WORD_COUNT, \
|
||||
WORD2VEC_CONTEXT
|
||||
from coffeehouse_dltc.utils import get_documents, save_to_disk
|
||||
|
||||
|
||||
def train_word2vec_in_memory(docs, vec_dim=EMBEDDING_SIZE):
    """
    Builds word embeddings from documents and return a model
    :param docs: list of Document objects
    :param vec_dim: the dimensionality of the vector that's being built

    :return: trained gensim object with word embeddings
    """
    # Flatten in one pass. Folding the per-document lists together with
    # reduce(d1 + d2) re-copied the growing list on every step (quadratic)
    # and raised TypeError when docs was empty.
    all_sentences = [sentence for doc in docs for sentence in doc.read_sentences()]

    # Initialize and train the model
    model = Word2Vec(
        all_sentences,
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    return model
|
||||
|
||||
|
||||
def compute_word2vec_for_phrase(phrase, model):
    """
    Sum the embeddings of every known word of a whitespace-separated phrase
    :param phrase: unicode, parsed label of a keyphrase
    :param model: gensim word2vec object

    :return: float32 numpy array of length model.vector_size; all zeros when
     none of the words is present in model.wv
    """
    embedding = np.zeros(model.vector_size, dtype='float32')
    known_vectors = model.wv

    for token in phrase.split():
        if token not in known_vectors:
            # Words the model does not know contribute nothing
            continue
        embedding += known_vectors[token]

    return embedding
|
||||
|
||||
|
||||
def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """ Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
     This scaler can be used afterwards for normalizing feature matrices.

    :param data_dir: directory handed to get_documents() to obtain the documents
    :param word2vec_model: a loaded Word2Vec object, or a path that is loaded
     with Word2Vec.load()
    :param batch_size: number of documents consumed per partial_fit() call
    :param persist_to_path: when set, the fitted scaler is passed to
     save_to_disk() with this path

    :return: the fitted StandardScaler
    :raises ValueError: if no word of any document is known to the model
    """
    from itertools import islice

    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    # iter() keeps islice safe even if get_documents() returns a plain iterable
    doc_iterator = iter(get_documents(data_dir))
    scaler = StandardScaler(copy=False)
    word_vectors = word2vec_model.wv
    fitted = False

    while True:
        batch = list(islice(doc_iterator, batch_size))
        if not batch:
            break

        vectors = [
            word_vectors[word]
            for doc in batch
            for word in doc.get_all_words()
            if word in word_vectors
        ]

        # An empty batch used to reach partial_fit() whenever the number of
        # documents was a multiple of batch_size; partial_fit() rejects an
        # empty array, so batches without any known word are skipped.
        if not vectors:
            continue

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)
        fitted = True

    if not fitted:
        raise ValueError(
            "No word vectors were found in '{0}' to fit the scaler on".format(data_dir)
        )

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
|
||||
|
||||
|
||||
def train_word2vec(doc_directory, vec_dim=EMBEDDING_SIZE):
    """
    Train the Word2Vec object iteratively, loading stuff to memory one by one.
    :param doc_directory: directory with the documents
    :param vec_dim: the dimensionality of the vector that's being built

    :return: Word2Vec object
    """
    class SentenceIterator(object):
        """ Re-iterable stream of tokenized sentences, one document at a time """

        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            # Only '.txt' files are documents. Sorting makes every pass visit
            # them in the same order; the previous set of file-name stems
            # iterated in an arbitrary order and tried to open '<stem>.txt'
            # for any stray file in the directory.
            filenames = sorted(
                filename for filename in os.listdir(self.dirname)
                if filename.endswith('.txt')
            )
            for doc_id, filename in enumerate(filenames):
                d = Document(doc_id, os.path.join(self.dirname, filename))
                for sentence in d.read_sentences():
                    yield sentence

    # Initialize and train the model
    model = Word2Vec(
        SentenceIterator(doc_directory),
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    return model
|
|
@ -0,0 +1,188 @@
|
|||
import os
|
||||
import json
|
||||
import shutil
|
||||
from os import path
|
||||
|
||||
from coffeehouse_dltc import DLTC
|
||||
|
||||
|
||||
class Configuration(object):
    """
    Describes a model source directory (a model.json file plus the data files
    it references) and drives the training that turns it into a built model.
    """

    def __init__(self, src_directory):
        """
        Public Constructor

        :param src_directory: directory which contains model.json and the
         data files listed in its 'classification' section
        :raises FileNotFoundError: if the directory or its model.json is missing
        """
        self.src = src_directory
        if not path.exists(src_directory):
            raise FileNotFoundError("The source directory '{0}' was not found".
                                    format(src_directory))

        self.configuration_file = path.join(self.src, "model.json")
        if not path.exists(self.configuration_file):
            raise FileNotFoundError("The file 'model.json' was not found in the source directory")

        # Read as UTF-8 explicitly instead of relying on the locale default
        with open(self.configuration_file, 'r', encoding='utf-8') as f:
            self.configuration = json.load(f)

        model_section = self.configuration['model']
        self.__name__ = model_section['name']
        self.__author__ = model_section['author']
        self.__version__ = model_section['version']
        self.__description__ = model_section['description']

        # Maps every label to the path of the data file that holds its samples
        self.classifications = {}
        for classification_method in self.configuration['classification']:
            self.classifications[classification_method['l']] = path.join(
                self.src, classification_method['f']
            )

    def _classification_file(self, classification_name):
        """
        Returns the data file path registered for a label

        :param classification_name: label as given in model.json
        :return: path of the data file
        :raises ValueError: if the label is not defined in the configuration
        """
        if classification_name not in self.classifications:
            raise ValueError(
                "The classification label '{0}' is not defined in the configuration".format(
                    classification_name))
        return self.classifications[classification_name]

    def classifier_range(self, classification_name):
        """
        Determines the range of the classifier

        :param classification_name: label as given in model.json
        :return: Integer of the amount of lines the classifier's data file
         contains, 0 for an empty file
        :raises ValueError: if the label is not defined in the configuration
        """
        data_file = self._classification_file(classification_name)
        with open(data_file, 'r', encoding="utf8") as f:
            # Counting with sum() also works for an empty file, where the
            # loop variable of a manual enumerate() loop was never bound
            return sum(1 for _ in f)

    def classifier_contents(self, classification_name):
        """
        Returns the contents of the classifier

        :param classification_name: label as given in model.json
        :return: Contents of the classifier split into a list type, one
         entry per line
        :raises ValueError: if the label is not defined in the configuration
        """
        data_file = self._classification_file(classification_name)
        with open(data_file, 'r', encoding="utf8") as f:
            return f.read().splitlines()

    def classifier_labels(self):
        """
        Returns list of labels that this model is configured to use based on the classifier data

        :return: List of labels
        """
        return list(self.classifications)

    def create_structure(self):
        """
        Creates the model structure which allows training to be simplified

        Every line of every data file is written to its own
        '<label>_<index>.txt' file next to a '<label>_<index>.lab' file that
        holds the label. An existing structure directory is replaced.

        :return: the path of the directory containing the model structure
        """
        print("Preparing structure directory")
        temporary_path = "{0}_data".format(self.src)
        if path.exists(temporary_path):
            shutil.rmtree(temporary_path)

        data_path = path.join(temporary_path, "model_data")
        os.mkdir(temporary_path)
        print("Created directory '{0}'".format(temporary_path))
        os.mkdir(data_path)
        print("Created directory '{0}'".format(data_path))

        labels_file_path = path.join(temporary_path, "model_data.labels")

        with open(labels_file_path, 'w+', encoding='utf8') as f:
            for item in self.classifier_labels():
                f.write("%s\n" % item)

        print("Processing classifiers")
        for classifier_name in self.classifications:
            contents = self.classifier_contents(classifier_name)
            print("Processing label '{0}'".format(classifier_name))

            for index, value in enumerate(contents):
                content_file_path = path.join(
                    data_path, "{0}_{1}.txt".format(classifier_name, index))
                label_file_path = path.join(
                    data_path, "{0}_{1}.lab".format(classifier_name, index))
                with open(content_file_path, "w+", encoding="utf8") as content_file:
                    content_file.write(value)
                with open(label_file_path, "w+", encoding="utf8") as label_file:
                    label_file.write(classifier_name)
            print("Processed label '{0}'".format(classifier_name))

        print("Structure created at '{0}'".format(temporary_path))
        return temporary_path

    def train_model(self):
        """
        Starts the process of training the model by creating a model structure
        and creating the necessary models for classification

        The built files are written to '<source directory>_build', replacing
        any previous build. The temporary structure directory is removed
        afterwards, even when training fails.

        :return: None
        """
        directory_structure = self.create_structure()

        try:
            print("Preparing output directory")
            output_path = "{0}_build".format(self.src)
            model_name = self.configuration['model']['model_name']
            training_properties = self.configuration['training_properties']
            model_data_path = path.join(directory_structure, 'model_data')

            embeddings_path = path.join(output_path, "{0}.che".format(model_name))
            scaler_path = path.join(output_path, "{0}.chs".format(model_name))
            model_file_path = path.join(output_path, "{0}.chm".format(model_name))
            labels_file_path = path.join(output_path, "{0}.chl".format(model_name))

            if path.exists(output_path):
                shutil.rmtree(output_path)

            os.mkdir(output_path)

            print("Initializing CoffeeHouse DLTC Server")
            # noinspection SpellCheckingInspection
            dltc = DLTC()

            print("Creating word to vectors model")
            dltc.train_word2vec(
                model_data_path,
                vec_dim=training_properties['vec_dim']
            )

            print("Fitting Scalers")
            dltc.fit_scaler(model_data_path)

            print("Training model")
            dltc.train(
                model_data_path,
                self.classifier_labels(),
                nn_model=training_properties['architecture'],
                batch_size=training_properties['batch_size'],
                epochs=training_properties['epoch'],
                test_ratio=training_properties['test_ratio'],
                verbose=2
            )

            print("Saving data to disk")
            dltc.save_word2vec_model(embeddings_path)
            print("Created file '{0}'".format(embeddings_path))
            dltc.save_scaler(scaler_path)
            print("Created file '{0}'".format(scaler_path))
            dltc.save_model(model_file_path)
            print("Created file '{0}'".format(model_file_path))
            with open(labels_file_path, 'w', encoding='utf-8') as f:
                json.dump(self.classifier_labels(), f, ensure_ascii=False, indent=4)
            print("Created file '{0}'".format(labels_file_path))
        finally:
            # Remove the temporary structure whether or not training succeeded
            print("Cleaning up")
            if path.exists(directory_structure):
                shutil.rmtree(directory_structure)

        print("Model created at '{0}'".format(output_path))
|
|
@ -0,0 +1,17 @@
|
|||
# word2vec & scaler
# Default dimensionality of the word vectors (default of the vec_dim parameters)
EMBEDDING_SIZE = 100

# Cores to use while fitting word2vec vectors
WORD2VEC_WORKERS = 4
# Passed to Word2Vec as min_count
MIN_WORD_COUNT = 5
# Passed to Word2Vec as window
WORD2VEC_CONTEXT = 5

# Models
# Default architecture name used when training (default of nn_model)
NN_ARCHITECTURE = 'cnn'

# Training parameters
# Defaults of the batch_size and epochs parameters of the training methods
BATCH_SIZE = 64
EPOCHS = 1

# Number of tokens to save from the abstract, zero padded
# NOTE(review): not referenced in the code visible here - confirm where it is used
SAMPLE_LENGTH = 200
|
|
@ -0,0 +1,333 @@
|
|||
from __future__ import unicode_literals, print_function, division
|
||||
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
import keras.models
|
||||
import numpy as np
|
||||
|
||||
from coffeehouse_dltc.base.document import Document
|
||||
from coffeehouse_dltc.base.word2vec import train_word2vec, fit_scaler
|
||||
from coffeehouse_dltc.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, EPOCHS
|
||||
from coffeehouse_dltc.nn.input_data import get_data_for_model
|
||||
from coffeehouse_dltc.nn.models import get_nn_model
|
||||
from coffeehouse_dltc.utils import save_to_disk, load_from_disk
|
||||
|
||||
|
||||
# noinspection DuplicatedCode
|
||||
class DLTC(object):
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Public Constructor
|
||||
"""
|
||||
self.labels = None
|
||||
self.keras_model = None
|
||||
self.word2vec_model = None
|
||||
self.scaler = None
|
||||
|
||||
def load_model_cluster(self, model_directory):
|
||||
"""
|
||||
Loads the model cluster into memory in which the model can be used
|
||||
to be predicted from
|
||||
|
||||
:param model_directory: The directory which contains the model
|
||||
files such as .che, .chs, .chm and .chl
|
||||
:return: None
|
||||
"""
|
||||
if not os.path.exists(model_directory):
|
||||
raise FileNotFoundError("The model directory does not exist")
|
||||
|
||||
embeddings_path = os.path.join(model_directory, "{0}.che".
|
||||
format(os.path.basename(model_directory[:-6])))
|
||||
scaler_path = os.path.join(model_directory, "{0}.chs".
|
||||
format(os.path.basename(model_directory[:-6])))
|
||||
model_file_path = os.path.join(model_directory, "{0}.chm".
|
||||
format(os.path.basename(model_directory[:-6])))
|
||||
labels_file_path = os.path.join(model_directory, "{0}.chl".
|
||||
format(os.path.basename(model_directory[:-6])))
|
||||
|
||||
if not os.path.exists(embeddings_path):
|
||||
raise FileNotFoundError("The embeddings model was not found ('{0}')".
|
||||
format(embeddings_path))
|
||||
|
||||
if not os.path.exists(scaler_path):
|
||||
raise FileNotFoundError("The scaler model was not found ('{0}')".
|
||||
format(scaler_path))
|
||||
|
||||
if not os.path.exists(model_file_path):
|
||||
raise FileNotFoundError("The classification model was not found ('{0}')".
|
||||
format(model_file_path))
|
||||
|
||||
if not os.path.exists(labels_file_path):
|
||||
raise FileNotFoundError("The labels file was not found ('{0}')".
|
||||
format(labels_file_path))
|
||||
|
||||
# Read the labels file
|
||||
with open(labels_file_path, 'r') as f:
|
||||
self.labels = json.load(f)
|
||||
|
||||
self.load_model(model_file_path)
|
||||
self.load_word2vec_model(embeddings_path)
|
||||
self.load_scaler(scaler_path)
|
||||
|
||||
def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
          nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0,
          epochs=EPOCHS, verbose=1):
    """
    Build a fresh Keras model and fit it on data that is loaded in one go
    :param train_dir: directory with data files. Text files should end with
    '.txt' and corresponding files containing labels should end with '.lab'
    :param vocabulary: iterable containing all considered labels
    :param test_dir: directory with test files. They will be used to evaluate
    the model after every epoch of training.
    :param callbacks: objects passed to the Keras fit function as callbacks
    :param nn_model: string defining the NN architecture e.g. 'cnn'
    :param batch_size: size of one batch
    :param test_ratio: the ratio of samples that will be withheld from training
    and used for testing (passed to Keras as validation_split). This can be
    overridden by test_dir.
    :param epochs: number of epochs to train
    :param verbose: 0, 1 or 2. As in Keras.

    :return: History object
    :raises RuntimeError: if the word2vec model or the scaler is not set
    :raises ValueError: if train_dir or a given test_dir is not a directory
    """
    # Preconditions, checked in the same order as they are documented above
    if not self.word2vec_model:
        raise RuntimeError('word2vec model is not trained. Run train_word2vec() first.')

    if not self.scaler:
        raise RuntimeError('The scaler is not trained. Run fit_scaler() first.')

    if not os.path.isdir(train_dir):
        raise ValueError('The training directory ' + train_dir + ' does not exist')

    if test_dir and not os.path.isdir(test_dir):
        raise ValueError('The test directory ' + test_dir + ' does not exist')

    if self.keras_model:
        print('WARNING! Overwriting already trained Keras model.', file=sys.stderr)

    self.labels = vocabulary
    network = get_nn_model(
        nn_model,
        embedding=self.word2vec_model.vector_size,
        output_length=len(vocabulary)
    )
    self.keras_model = network

    training_set, test_data = get_data_for_model(
        train_dir,
        vocabulary,
        test_dir=test_dir,
        nn_model=network,
        as_generator=False,
        batch_size=batch_size,
        word2vec_model=self.word2vec_model,
        scaler=self.scaler,
    )
    x_train, y_train = training_set

    history = network.fit(
        x_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=test_data,
        validation_split=test_ratio,
        callbacks=callbacks or [],
        verbose=verbose,
    )
    return history
|
||||
|
||||
def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
                nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE,
                epochs=EPOCHS, verbose=1):
    """
    Build a fresh Keras model and fit it on data that is fed by a generator
    :param train_dir: directory with data files. Text files should end with
    '.txt' and corresponding files containing labels should end with '.lab'
    :param vocabulary: iterable containing all considered labels
    :param test_dir: directory with test files. They will be used to evaluate
    the model after every epoch of training.
    :param callbacks: objects passed to the Keras fit function as callbacks
    :param nn_model: string defining the NN architecture e.g. 'cnn'
    :param batch_size: size of one batch
    :param epochs: number of epochs to train
    :param verbose: 0, 1 or 2. As in Keras.

    :return: History object
    :raises RuntimeError: if the word2vec model or the scaler is not set
    :raises ValueError: if train_dir or a given test_dir is not a directory
    """
    # Preconditions, checked in the same order as they are documented above
    if not self.word2vec_model:
        raise RuntimeError('word2vec model is not trained. Run train_word2vec() first.')

    if not self.scaler:
        raise RuntimeError('The scaler is not trained. Run fit_scaler() first.')

    if not os.path.isdir(train_dir):
        raise ValueError('The training directory ' + train_dir + ' does not exist')

    if test_dir and not os.path.isdir(test_dir):
        raise ValueError('The test directory ' + test_dir + ' does not exist')

    if self.keras_model:
        print('WARNING! Overwriting already trained Keras model.', file=sys.stderr)

    self.labels = vocabulary
    network = get_nn_model(
        nn_model,
        embedding=self.word2vec_model.vector_size,
        output_length=len(vocabulary)
    )
    self.keras_model = network

    train_generator, test_data = get_data_for_model(
        train_dir,
        vocabulary,
        test_dir=test_dir,
        nn_model=network,
        as_generator=True,
        batch_size=batch_size,
        word2vec_model=self.word2vec_model,
        scaler=self.scaler,
    )

    # File names minus their last four characters: a '.txt' / '.lab' pair
    # collapses into a single entry, so each sample is counted once
    sample_names = {filename[:-4] for filename in os.listdir(train_dir)}
    steps_per_epoch = math.ceil(len(sample_names) / batch_size)

    history = network.fit_generator(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        validation_data=test_data,
        callbacks=callbacks or [],
        verbose=verbose,
    )
    return history
|
||||
|
||||
def predict_from_file(self, filepath):
    """
    Predict labels for a txt file
    :param filepath: path to the file

    :return: dict mapping each label to its predicted value, built from the
     ranking returned by _predict()
    """
    ranking = self._predict(Document(0, filepath))
    return dict(ranking)
|
||||
|
||||
def predict_from_text(self, text):
    """
    Classify a piece of text that is already held in memory.
    :param text: string or unicode with the text

    :return: dict built from the (label, score) pairs produced by _predict()
    """
    document = Document(0, None, text=text)
    ranked_pairs = self._predict(document)
    return dict(ranked_pairs)
|
||||
|
||||
def _predict(self, doc):
|
||||
"""
|
||||
Predict labels for a given Document object
|
||||
:param doc: Document object
|
||||
:return: list of labels with corresponding confidence intervals
|
||||
"""
|
||||
if type(self.keras_model.input) == list:
|
||||
_, sample_length, embedding_size = self.keras_model.input_shape[0]
|
||||
else:
|
||||
_, sample_length, embedding_size = self.keras_model.input_shape
|
||||
|
||||
words = doc.get_all_words()[:sample_length]
|
||||
x_matrix = np.zeros((1, sample_length, embedding_size))
|
||||
|
||||
for i, w in enumerate(words):
|
||||
if w in self.word2vec_model.wv:
|
||||
word_vector = self.word2vec_model.wv[w].reshape(1, -1)
|
||||
scaled_vector = self.scaler.transform(word_vector, copy=True)[0]
|
||||
x_matrix[doc.doc_id][i] = scaled_vector
|
||||
|
||||
if type(self.keras_model.input) == list:
|
||||
x = [x_matrix] * len(self.keras_model.input)
|
||||
else:
|
||||
x = [x_matrix]
|
||||
|
||||
y_predicted = self.keras_model.predict(x)
|
||||
|
||||
zipped = zip(self.labels, y_predicted[0])
|
||||
|
||||
return sorted(zipped, key=lambda elem: elem[1], reverse=True)
|
||||
|
||||
def init_word_vectors(self, train_dir, vec_dim=EMBEDDING_SIZE):
    """
    Convenience wrapper: run train_word2vec() and then fit_scaler() on the
    same directory.
    :param train_dir: directory with '.txt' files
    :param vec_dim: dimensionality of the word vectors, passed to train_word2vec()

    :return: None
    """
    # Order matters: fit_scaler() refuses to run without a word2vec model.
    self.train_word2vec(train_dir, vec_dim=vec_dim)
    self.fit_scaler(train_dir)
|
||||
|
||||
def train_word2vec(self, train_dir, vec_dim=EMBEDDING_SIZE):
    """
    Build a word2vec model from a directory of text files and store it on
    this object, replacing any model that is already there.
    :param train_dir: directory with '.txt' files
    :param vec_dim: dimensionality of the word vectors

    :return: the newly trained model (also kept in self.word2vec_model)
    """
    already_trained = bool(self.word2vec_model)
    if already_trained:
        print('WARNING! Overwriting already trained word2vec model.', file=sys.stderr)

    # Inside the method body this name resolves to the module-level helper,
    # not to this method.
    model = train_word2vec(train_dir, vec_dim=vec_dim)
    self.word2vec_model = model
    return model
|
||||
|
||||
def fit_scaler(self, train_dir):
    """
    Fit the scaler on a directory of text files and store it on this object.
    A word2vec model has to be present beforehand.
    :param train_dir: directory with '.txt' files

    :return: the newly fitted scaler (also kept in self.scaler)
    :raises ValueError: when no word2vec model is available
    """
    w2v = self.word2vec_model
    if not w2v:
        raise ValueError('word2vec model is not trained. Run train_word2vec() first.')

    if self.scaler:
        print('WARNING! Overwriting already fitted scaler.', file=sys.stderr)

    # Inside the method body this name resolves to the module-level helper,
    # not to this method.
    fitted = fit_scaler(train_dir, word2vec_model=w2v)
    self.scaler = fitted
    return fitted
|
||||
|
||||
def save_scaler(self, filepath, overwrite=False):
    """
    Write the fitted scaler to filepath via save_to_disk().
    :param filepath: destination file
    :param overwrite: forwarded to save_to_disk()

    :raises ValueError: when there is no scaler to save
    """
    scaler = self.scaler
    if scaler:
        save_to_disk(filepath, scaler, overwrite=overwrite)
        return
    raise ValueError("Can't save the scaler, it has not been trained yet")
|
||||
|
||||
def load_scaler(self, filepath):
    """
    Replace self.scaler with the object read from filepath by load_from_disk().
    :param filepath: file previously written by save_scaler()
    """
    restored = load_from_disk(filepath)
    self.scaler = restored
|
||||
|
||||
def save_word2vec_model(self, filepath, overwrite=False):
    """
    Write the word2vec model to filepath via save_to_disk().
    :param filepath: destination file
    :param overwrite: forwarded to save_to_disk()

    :raises ValueError: when there is no word2vec model to save
    """
    model = self.word2vec_model
    if model:
        save_to_disk(filepath, model, overwrite=overwrite)
        return
    raise ValueError("Can't save the word2vec model, it has not been trained yet")
|
||||
|
||||
def load_word2vec_model(self, filepath):
    """
    Replace self.word2vec_model with the object read from filepath by
    load_from_disk().
    :param filepath: file previously written by save_word2vec_model()
    """
    restored = load_from_disk(filepath)
    self.word2vec_model = restored
|
||||
|
||||
def save_model(self, filepath, overwrite=False):
    """
    Save the keras NN model to a file (HDF5 according to the original
    docstring) by calling the model's own save().
    :param filepath: destination file
    :param overwrite: when True an existing file at filepath is replaced
        instead of raising. Defaults to False, which keeps the previous
        behaviour and mirrors save_scaler() / save_word2vec_model().

    :raises ValueError: when there is no model to save, or when the file
        already exists and overwrite is False
    """
    if not self.keras_model:
        raise ValueError("Can't save the model, it has not been trained yet")

    if not overwrite and os.path.exists(filepath):
        raise ValueError("File " + filepath + " already exists!")

    self.keras_model.save(filepath)
|
||||
|
||||
def load_model(self, filepath):
    """
    Replace self.keras_model with the model that keras loads from filepath.
    :param filepath: path of a previously saved keras model

    :raises ValueError: when filepath does not exist
    """
    if os.path.exists(filepath):
        self.keras_model = keras.models.load_model(filepath)
        return
    raise ValueError("File " + filepath + " does not exist")
|
|
@ -0,0 +1,132 @@
|
|||
from __future__ import unicode_literals, division
|
||||
|
||||
import os
|
||||
import threading
|
||||
|
||||
import numpy as np
|
||||
|
||||
from coffeehouse_dltc.base.document import Document
|
||||
from coffeehouse_dltc.config import BATCH_SIZE, SAMPLE_LENGTH
|
||||
from coffeehouse_dltc.utils import get_answers_for_doc, load_from_disk
|
||||
|
||||
|
||||
def get_data_for_model(train_dir, labels, test_dir=None, nn_model=None,
                       as_generator=False, batch_size=BATCH_SIZE,
                       word2vec_model=None, scaler=None):
    """
    Prepare the training data and, when a test directory is given, the test data.
    :param train_dir: directory with train files
    :param labels: an iterable of predefined labels (controlled vocabulary)
    :param test_dir: directory with test files
    :param nn_model: Keras model of the NN
    :param as_generator: when true the train data is a generator of batches,
    otherwise it is built in memory in one go
    :param batch_size: integer, size of the batch (generator mode only)
    :param word2vec_model: trained w2v gensim model
    :param scaler: scaling object for X matrix normalisation e.g. StandardScaler

    :return: tuple (train_data, test_data). train_data is the result of
    build_x_and_y() or a generator of such results; test_data is the result
    of build_x_and_y() or None when no test_dir was given
    """

    def _file_ids(directory):
        # Drop the last four characters of every name (e.g. '.txt' / '.lab');
        # the set collapses names that share the same stem.
        return {filename[:-4] for filename in os.listdir(directory)}

    kwargs = dict(
        label_indices={lab: i for i, lab in enumerate(labels)},
        word2vec_model=word2vec_model,
        scaler=scaler,
        nn_model=nn_model,
    )

    if as_generator:
        train_data = iterate_over_batches(FilenameIterator(train_dir, batch_size), **kwargs)
    else:
        train_data = build_x_and_y(_file_ids(train_dir), train_dir, **kwargs)

    # Test data is always materialised in memory, even in generator mode.
    test_data = build_x_and_y(_file_ids(test_dir), test_dir, **kwargs) if test_dir else None

    return train_data, test_data
|
||||
|
||||
|
||||
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Turn a collection of documents into the (X, y) pair used by the model.
    :param filenames: iterable of strings showing file ids (no extension)
    :param file_directory: path to a directory where those files lie
    :param kwargs: must contain 'label_indices', 'word2vec_model', 'scaler'
    and 'nn_model'

    :return: a tuple (X, y) where X is a list holding the same feature matrix
    once per model input and y is a boolean document-by-label matrix
    """
    label_indices, word2vec_model, scaler, nn_model = (
        kwargs[key]
        for key in ('label_indices', 'word2vec_model', 'scaler', 'nn_model')
    )

    doc_count = len(filenames)
    x_matrix = np.zeros((doc_count, SAMPLE_LENGTH, word2vec_model.vector_size))
    y_matrix = np.zeros((doc_count, len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        txt_name = fname + '.txt'
        doc = Document(doc_id, os.path.join(file_directory, txt_name))

        # Only the first SAMPLE_LENGTH words are used; words unknown to
        # word2vec keep their all-zero row.
        for i, w in enumerate(doc.get_all_words()[:SAMPLE_LENGTH]):
            if w not in word2vec_model.wv:
                continue
            word_vector = word2vec_model.wv[w].reshape(1, -1)
            x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

        doc_labels = get_answers_for_doc(
            txt_name,
            file_directory,
            filtered_by=set(label_indices.keys()),
        )
        for lab in doc_labels:
            y_matrix[doc_id][label_indices[lab]] = True

    # A model with several inputs gets the same matrix once per input.
    if nn_model and type(nn_model.input) == list:
        return [x_matrix] * len(nn_model.input), y_matrix
    return [x_matrix], y_matrix
|
||||
|
||||
|
||||
def iterate_over_batches(filename_it, **kwargs):
    """
    Endless generator of training batches.
    :param filename_it: FilenameIterator object; its next() method supplies
    the file ids of each batch and its dirname attribute their directory
    :param kwargs: additional necessary data for matrix building e.g. scaler
    :return: yields the result of build_x_and_y() for every batch
    """
    while True:
        batch_files = filename_it.next()
        batch = build_x_and_y(batch_files, filename_it.dirname, **kwargs)
        yield batch
|
||||
|
||||
|
||||
class FilenameIterator(object):
    """ An iterator yielding batches of file ids from a given folder and
    looping forever. Can be used for external memory training.

    File ids are the directory entries with their last four characters
    removed (e.g. a '.txt' or '.lab' extension); duplicates are collapsed.
    Batch assembly in next() is guarded by a lock. """

    def __init__(self, dirname, batch_size):
        """
        :param dirname: directory whose entries are iterated over
        :param batch_size: maximum number of file ids per batch
        """
        self.dirname = dirname
        self.batch_size = batch_size
        self.lock = threading.Lock()
        self.files = list({filename[:-4] for filename in os.listdir(dirname)})
        self.i = 0

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next batch of file ids.

        The last batch of a pass may be shorter than batch_size; after it
        the iterator starts over from the first file. It never raises
        StopIteration.
        """
        with self.lock:

            # The previous batch ended exactly at the end of the list.
            if self.i == len(self.files):
                self.i = 0

            batch = self.files[self.i:self.i + self.batch_size]
            if len(batch) < self.batch_size:
                # Short batch: the list is exhausted, wrap around.
                self.i = 0
            else:
                self.i += self.batch_size

            return batch

    # Python 3 iterator protocol. Without this alias, iter(), for-loops and
    # the builtin next() reject the object because only the Python 2 style
    # next() method exists.
    __next__ = next
|
|
@ -0,0 +1,83 @@
|
|||
from keras.layers import Input, Dense, GRU, Dropout, BatchNormalization, MaxPooling1D, Conv1D, Flatten, Concatenate
|
||||
from keras.models import Model
|
||||
|
||||
from coffeehouse_dltc.config import SAMPLE_LENGTH
|
||||
|
||||
|
||||
def get_nn_model(nn_model, embedding, output_length):
    """
    Build the keras model for the requested architecture.
    :param nn_model: architecture name, either 'cnn' or 'rnn'
    :param embedding: size of one word vector, passed on as embedding_size
    :param output_length: number of output units, i.e. labels

    :return: whatever cnn() or rnn() returns
    :raises ValueError: for any other architecture name
    """
    if nn_model == 'cnn':
        builder = cnn
    elif nn_model == 'rnn':
        builder = rnn
    else:
        raise ValueError("Unknown NN type: {}".format(nn_model))

    return builder(embedding_size=embedding, output_length=output_length)
|
||||
|
||||
|
||||
def cnn(embedding_size, output_length):
    """ Build and compile a keras CNN with one input/convolution branch per
    kernel width (1 to 5); the pooled branches are concatenated and fed to
    a sigmoid output layer of output_length units """

    filter_count = 256
    kernel_widths = (1, 2, 3, 4, 5)

    inputs, pooled_branches = [], []

    for width in kernel_widths:
        branch_input = Input(shape=(SAMPLE_LENGTH, embedding_size))
        inputs.append(branch_input)

        conv = Conv1D(
            filter_count,
            width,
            kernel_initializer='lecun_uniform',
            activation='tanh',
        )(branch_input)

        # The pool window equals the convolution output length, so each
        # filter is reduced to a single value.
        pooled = MaxPooling1D(pool_size=SAMPLE_LENGTH - width + 1)(conv)
        pooled_branches.append(pooled)

    combined = Concatenate()(pooled_branches)
    regularized = Dropout(0.5)(combined)
    flat = Flatten()(regularized)
    outputs = Dense(output_length, activation='sigmoid')(flat)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['top_k_categorical_accuracy'],
    )
    return model
|
||||
|
||||
|
||||
def rnn(embedding_size, output_length):
    """ Build and compile a keras RNN: a single GRU layer followed by batch
    normalisation, dropout and a sigmoid output layer of output_length units """
    hidden_units = 256
    sample_shape = (SAMPLE_LENGTH, embedding_size)

    inputs = Input(shape=sample_shape)

    recurrent = GRU(
        hidden_units,
        input_shape=sample_shape,
        kernel_initializer="glorot_uniform",
        recurrent_initializer='normal',
        activation='relu',
    )(inputs)

    normalized = BatchNormalization()(recurrent)
    regularized = Dropout(0.1)(normalized)
    outputs = Dense(output_length, activation='sigmoid')(regularized)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['top_k_categorical_accuracy'],
    )
    return model
|
|
@ -0,0 +1,171 @@
|
|||
from __future__ import division
|
||||
|
||||
try:
|
||||
# noinspection PyPep8Naming
|
||||
import cPickle as pickle
|
||||
except ImportError:
|
||||
import pickle
|
||||
|
||||
import io
|
||||
import os
|
||||
import random
|
||||
from collections import Counter, defaultdict
|
||||
|
||||
from coffeehouse_dltc.base.document import Document
|
||||
|
||||
|
||||
def save_to_disk(path_to_disk, obj, overwrite=False):
    """
    Pickle an object to disk
    :param path_to_disk: destination file; its directory must already exist
    :param obj: any picklable object
    :param overwrite: allow replacing an existing file

    :raises ValueError: when the target directory is missing, or when the
    file exists and overwrite is False
    """
    # dirname() is '' for a bare file name, which means the current
    # directory; without the fallback such paths were always rejected.
    dirname = os.path.dirname(path_to_disk) or os.curdir
    if not os.path.exists(dirname):
        raise ValueError("Path " + dirname + " does not exist")

    if not overwrite and os.path.exists(path_to_disk):
        raise ValueError("File " + path_to_disk + " already exists")

    # The context manager closes the handle even if pickling fails.
    with open(path_to_disk, 'wb') as f:
        pickle.dump(obj, f)
|
||||
|
||||
|
||||
def load_from_disk(path_to_disk):
    """
    Load a pickle from disk to memory
    :param path_to_disk: file previously written with save_to_disk()

    :return: the unpickled object
    :raises ValueError: when the file does not exist
    """
    if not os.path.exists(path_to_disk):
        raise ValueError("File " + path_to_disk + " does not exist")

    # NOTE(security): unpickling can execute arbitrary code, so only load
    # files that come from a trusted source.
    # The context manager closes the handle even if unpickling fails.
    with open(path_to_disk, 'rb') as f:
        return pickle.load(f)
|
||||
|
||||
|
||||
def get_documents(data_dir, as_generator=True, shuffle=False):
    """
    Create one Document per distinct file id found in a directory.
    :param data_dir: path to the directory with .txt files
    :param as_generator: when true the documents are created lazily,
    otherwise a list is returned
    :param shuffle: when true the (sorted) file ids are shuffled before the
    documents get their ids

    :return: generator or a list of Document objects
    """
    # File ids are the directory entries minus their last four characters.
    files = sorted({filename[:-4] for filename in os.listdir(data_dir)})
    if shuffle:
        random.shuffle(files)

    documents = (
        Document(doc_id, os.path.join(data_dir, stem + '.txt'))
        for doc_id, stem in enumerate(files)
    )
    if as_generator:
        return documents
    return list(documents)
|
||||
|
||||
|
||||
def get_all_answers(data_dir, filtered_by=None):
    """
    Collect the ground truth labels of every document in a directory.
    :param data_dir: path to the directory with .lab files
    :param filtered_by: passed on to get_answers_for_doc() unchanged

    :return: dictionary mapping each file id to the set returned by
    get_answers_for_doc(), e.g. {'101231': {'lab1', 'lab2'}}
    """
    # File ids are the directory entries minus their last four characters.
    file_ids = {filename[:-4] for filename in os.listdir(data_dir)}
    return {
        file_id: get_answers_for_doc(file_id + '.txt', data_dir, filtered_by=filtered_by)
        for file_id in file_ids
    }
|
||||
|
||||
|
||||
def get_answers_for_doc(doc_name, data_dir, filtered_by=None):
    """
    Read ground_truth answers from a .lab file corresponding to the doc_name
    :param doc_name: the name of the document, should end with .txt (the
    last four characters are replaced by '.lab')
    :param data_dir: directory in which the documents and answer files are
    :param filtered_by: container of allowed labels; None disables filtering

    :return: set of unicodes containing answers for this particular document
    :raises ValueError: when the .lab file does not exist
    """
    filename = os.path.join(data_dir, doc_name[:-4] + '.lab')

    if not os.path.exists(filename):
        raise ValueError("Answer file " + filename + " does not exist")

    # One label per line.
    with io.open(filename, 'r') as f:
        answers = {line.rstrip('\n') for line in f}

    # Compare against None: an empty vocabulary must filter everything out
    # rather than silently disable filtering.
    if filtered_by is not None:
        answers = {kw for kw in answers if kw in filtered_by}

    return answers
|
||||
|
||||
|
||||
def calculate_label_distribution(data_dir, filtered_by=None):
    """
    Group the labels of a directory by how many documents carry them. Useful
    for spotting the most frequent and the rarely used labels, so that the
    target vocabulary can be trimmed accordingly.
    :param data_dir: directory path with the .lab files
    :param filtered_by: a set of labels that defines the vocabulary

    :return: defaultdict mapping a document count to the list of labels that
    occur in exactly that many documents, e.g. {14: ['lab1', 'lab2']}
    """
    per_document = get_all_answers(data_dir, filtered_by=filtered_by)
    counts = Counter(
        label
        for doc_labels in per_document.values()
        for label in doc_labels
    )

    histogram = defaultdict(list)
    for label, occurrences in counts.items():
        histogram[occurrences].append(label)

    return histogram
|
||||
|
||||
|
||||
def calculate_number_of_labels_distribution(data_dir, filtered_by=None):
    """ Count how many documents have 3 labels, 4 labels etc.
    Return the (number_of_labels, number_of_documents) items of that count. """
    per_document = get_all_answers(data_dir, filtered_by=filtered_by)
    size_counts = Counter(len(doc_labels) for doc_labels in per_document.values())
    return size_counts.items()
|
||||
|
||||
|
||||
def get_coverage_ratio_for_label_subset(no_of_labels, hist=None, data_dir=None):
    """
    Compute fraction of the samples we would be able to predict, if we reduce
    the number of labels to a certain subset of the size no_of_labels.
    :param no_of_labels: the number of labels that we limit the ontology to
    :param hist: histogram of the samples.
    Result of calculate_label_distribution function
    :param data_dir: directory used to compute the histogram when hist is
    missing or empty

    :return: tuple (number of labels that we need to consider, coverage
    ratio), or -1 when the histogram holds fewer than no_of_labels labels
    :raises TypeError: when neither hist nor data_dir is given
    """
    if not hist:
        # calculate_label_distribution() needs a directory; the previous
        # argument-less fallback call could never succeed.
        if data_dir is None:
            raise TypeError("either a non-empty hist or data_dir must be provided")
        hist = calculate_label_distribution(data_dir)

    # (documents per label, number of labels with that count), ascending
    buckets = sorted((count, len(labels)) for count, labels in hist.items())

    total_shots = sum(docs * label_count for docs, label_count in buckets)
    labels_collected = 0
    hits_collected = 0
    # Take the most frequent labels first.
    for docs, label_count in reversed(buckets):
        hits_collected += docs * label_count
        labels_collected += label_count
        if labels_collected >= no_of_labels:
            return labels_collected, hits_collected / float(total_shots)

    return -1
|
||||
|
||||
|
||||
def get_top_n_labels(n, hist=None, data_dir=None):
    """
    Return the n most popular labels
    :param n: number of labels to return
    :param hist: histogram, result of calculate_label_distribution() function
    :param data_dir: directory used to compute the histogram when hist is
    missing or empty

    :return: list of at most n labels, most frequent first
    :raises TypeError: when neither hist nor data_dir is given
    """
    if not hist:
        # calculate_label_distribution() needs a directory; the previous
        # argument-less fallback call could never succeed.
        if data_dir is None:
            raise TypeError("either a non-empty hist or data_dir must be provided")
        hist = calculate_label_distribution(data_dir)

    answer = []
    # Walk the document counts from highest to lowest.
    for count in sorted(hist, reverse=True):
        answer.extend(hist[count])
        if len(answer) >= n:
            break

    return answer[:n]
|
|
@ -0,0 +1,10 @@
|
|||
cython
|
||||
nltk
|
||||
numpy
|
||||
scipy
|
||||
gensim
|
||||
scikit-learn
|
||||
keras
|
||||
h5py
|
||||
tensorflow
|
||||
six
|
|
@ -0,0 +1,36 @@
|
|||
from setuptools import setup, find_packages
|
||||
|
||||
# Packaging metadata for the coffeehouse_dltc distribution.
setup(
    name='coffeehouse_dltc',
    version='1.0.1',
    description='Deep Learning Text Classification Engine',
    url='https://github.com/Intellivoid/CoffeeHouse-DLTC',
    author='Zi Xing Narrakas',
    author_email='netkas@intellivoid.info',
    classifiers=[
        # 3 - Alpha
        # 4 - Beta
        # 5 - Production/Stable
        'Development Status :: 5 - Production/Stable',
        'Topic :: Text Processing',
        'Programming Language :: Python :: 3',
    ],
    keywords='multi-label classification nlp neural networks deep learning',
    # Every package discovered next to this file is included.
    packages=find_packages(),
    # Runtime dependencies; most are pinned to a compatible release range.
    install_requires=[
        'nltk',
        'numpy~=1.17',
        'scipy~=1.3.1',
        'gensim~=3.8.0',
        'scikit-learn==0.22.0',
        'keras~=2.2.5',
        'h5py~=2.9',
        'tensorflow~=1.14.0',
        'six',
        'cython'
    ],
    # Installs a `coffeehouse_dltc` console command.
    # NOTE(review): it points at coffeehouse_dltc.linear_classifier.cli:cli,
    # which is not visible in this part of the commit - confirm it exists.
    entry_points='''
        [console_scripts]
        coffeehouse_dltc=coffeehouse_dltc.linear_classifier.cli:cli
    ''',
)
|
Loading…
Reference in New Issue