# CofeehousePy/dltc/coffeehouse_dltc/main.py

from __future__ import unicode_literals, print_function, division
import math
import os
import sys
import json
import keras.models
import numpy as np
from coffeehouse_dltc.base.document import Document
from coffeehouse_dltc.base.word2vec import train_word2vec, fit_scaler
from coffeehouse_dltc.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, EPOCHS
from coffeehouse_dltc.nn.input_data import get_data_for_model
from coffeehouse_dltc.nn.models import get_nn_model
from coffeehouse_dltc.utils import save_to_disk, load_from_disk


# noinspection DuplicatedCode
class DLTC(object):

    def __init__(self):
        """
        Public Constructor
        """
        self.labels = None
        self.keras_model = None
        self.word2vec_model = None
        self.scaler = None

    def load_model_cluster(self, model_directory):
        """
        Load a model cluster into memory so that the model can be
        used for prediction.

        :param model_directory: the directory which contains the model
            files (.che, .chs, .chm and .chl)
        :return: None
        """
        if not os.path.exists(model_directory):
            raise FileNotFoundError("The model directory does not exist")

        # The directory name is expected to end with a six-character
        # suffix (presumably '_model'); stripping it yields the base
        # name shared by every file in the cluster.
        model_name = os.path.basename(model_directory[:-6])
        embeddings_path = os.path.join(model_directory, "{0}.che".format(model_name))
        scaler_path = os.path.join(model_directory, "{0}.chs".format(model_name))
        model_file_path = os.path.join(model_directory, "{0}.chm".format(model_name))
        labels_file_path = os.path.join(model_directory, "{0}.chl".format(model_name))

        if not os.path.exists(embeddings_path):
            raise FileNotFoundError("The embeddings model was not found ('{0}')".format(embeddings_path))
        if not os.path.exists(scaler_path):
            raise FileNotFoundError("The scaler model was not found ('{0}')".format(scaler_path))
        if not os.path.exists(model_file_path):
            raise FileNotFoundError("The classification model was not found ('{0}')".format(model_file_path))
        if not os.path.exists(labels_file_path):
            raise FileNotFoundError("The labels file was not found ('{0}')".format(labels_file_path))

        # Read the labels file
        with open(labels_file_path, 'r') as f:
            self.labels = json.load(f)

        self.load_model(model_file_path)
        self.load_word2vec_model(embeddings_path)
        self.load_scaler(scaler_path)
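
    # Example (illustrative sketch, not part of the original module): loading
    # a cluster stored in a directory whose name carries the assumed '_model'
    # suffix, e.g. 'news_model' holding news.che, news.chs, news.chm and
    # news.chl:
    #
    #     dltc = DLTC()
    #     dltc.load_model_cluster('/path/to/news_model')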

    def train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
              nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0,
              epochs=EPOCHS, verbose=1):
        """
        Train the model on the given data.

        :param train_dir: directory with data files. Text files should end with
            '.txt' and the corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels
        :param test_dir: directory with test files. They will be used to evaluate
            the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks
        :param nn_model: string defining the NN architecture, e.g. 'crnn'
        :param batch_size: size of one batch
        :param test_ratio: the ratio of samples that will be withheld from training
            and used for validation. Ignored when test_dir is given.
        :param epochs: number of epochs to train
        :param verbose: 0, 1 or 2, as in Keras
        :return: History object
        """
        if not self.word2vec_model:
            raise RuntimeError('word2vec model is not trained. Run train_word2vec() first.')
        if not self.scaler:
            raise RuntimeError('The scaler is not trained. Run fit_scaler() first.')
        if not os.path.isdir(train_dir):
            raise ValueError('The training directory ' + train_dir + ' does not exist')
        if test_dir and not os.path.isdir(test_dir):
            raise ValueError('The test directory ' + test_dir + ' does not exist')
        if self.keras_model:
            print('WARNING! Overwriting an already trained Keras model.', file=sys.stderr)

        self.labels = vocabulary
        self.keras_model = get_nn_model(
            nn_model,
            embedding=self.word2vec_model.vector_size,
            output_length=len(vocabulary)
        )

        (x_train, y_train), test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=False,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )

        # Keras ignores validation_split when validation_data is provided,
        # so test_dir takes precedence over test_ratio.
        return self.keras_model.fit(
            x_train,
            y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_data=test_data,
            validation_split=test_ratio,
            callbacks=callbacks or [],
            verbose=verbose,
        )
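
    # Example (illustrative sketch, not part of the original module): a full
    # training run, assuming 'data/train' holds paired doc1.txt/doc1.lab
    # files and LABELS lists every label that may appear in the .lab files:
    #
    #     dltc = DLTC()
    #     dltc.init_word_vectors('data/train', vec_dim=EMBEDDING_SIZE)
    #     history = dltc.train('data/train', LABELS, test_ratio=0.1)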

    def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None,
                    nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE,
                    epochs=EPOCHS, verbose=1):
        """
        Train the model on the given data, feeding it batch by batch from a
        generator instead of loading the whole training set into memory.

        :param train_dir: directory with data files. Text files should end with
            '.txt' and the corresponding files containing labels should end with '.lab'
        :param vocabulary: iterable containing all considered labels
        :param test_dir: directory with test files. They will be used to evaluate
            the model after every epoch of training.
        :param callbacks: objects passed to the Keras fit function as callbacks
        :param nn_model: string defining the NN architecture, e.g. 'crnn'
        :param batch_size: size of one batch
        :param epochs: number of epochs to train
        :param verbose: 0, 1 or 2, as in Keras
        :return: History object
        """
        if not self.word2vec_model:
            raise RuntimeError('word2vec model is not trained. Run train_word2vec() first.')
        if not self.scaler:
            raise RuntimeError('The scaler is not trained. Run fit_scaler() first.')
        if not os.path.isdir(train_dir):
            raise ValueError('The training directory ' + train_dir + ' does not exist')
        if test_dir and not os.path.isdir(test_dir):
            raise ValueError('The test directory ' + test_dir + ' does not exist')
        if self.keras_model:
            print('WARNING! Overwriting an already trained Keras model.', file=sys.stderr)

        self.labels = vocabulary
        self.keras_model = get_nn_model(
            nn_model,
            embedding=self.word2vec_model.vector_size,
            output_length=len(vocabulary)
        )

        train_generator, test_data = get_data_for_model(
            train_dir,
            vocabulary,
            test_dir=test_dir,
            nn_model=self.keras_model,
            as_generator=True,
            batch_size=batch_size,
            word2vec_model=self.word2vec_model,
            scaler=self.scaler,
        )

        # Each sample is a .txt/.lab pair, so count unique file stems to
        # derive the number of samples per epoch.
        nb_of_files = len({filename[:-4] for filename in os.listdir(train_dir)})
        steps_per_epoch = math.ceil(nb_of_files / batch_size)

        # Note: fit_generator() is deprecated in recent Keras versions,
        # where fit() accepts generators directly.
        return self.keras_model.fit_generator(
            train_generator,
            steps_per_epoch=steps_per_epoch,
            epochs=epochs,
            validation_data=test_data,
            callbacks=callbacks or [],
            verbose=verbose,
        )
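
    # Worked example of the step computation above: 1000 .txt/.lab pairs
    # yield 2000 directory entries but 1000 unique stems; with batch_size=64
    # that gives math.ceil(1000 / 64) = 16 steps per epoch.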

    def predict_from_file(self, filepath):
        """
        Predict labels for a text file.

        :param filepath: path to the file
        :return: dictionary mapping labels to confidence scores
        """
        doc = Document(0, filepath)
        return dict(self._predict(doc))

    def predict_from_text(self, text):
        """
        Predict labels for a given string of text.

        :param text: string or unicode with the text
        :return: dictionary mapping labels to confidence scores
        """
        doc = Document(0, None, text=text)
        return dict(self._predict(doc))
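
    # Example (illustrative; label names and scores are made up): predictions
    # come back as a mapping from each label in the vocabulary to the model's
    # confidence for it:
    #
    #     dltc.predict_from_text('the central bank raised interest rates')
    #     # -> {'economy': 0.93, 'politics': 0.41, 'sports': 0.02, ...}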

    def _predict(self, doc):
        """
        Predict labels for a given Document object.

        :param doc: Document object
        :return: list of (label, confidence) tuples, sorted by confidence
            in descending order
        """
        if isinstance(self.keras_model.input, list):
            _, sample_length, embedding_size = self.keras_model.input_shape[0]
        else:
            _, sample_length, embedding_size = self.keras_model.input_shape

        # Embed the first sample_length words, scaling each vector with the
        # fitted scaler; words absent from the word2vec vocabulary stay zero.
        words = doc.get_all_words()[:sample_length]
        x_matrix = np.zeros((1, sample_length, embedding_size))
        for i, w in enumerate(words):
            if w in self.word2vec_model.wv:
                word_vector = self.word2vec_model.wv[w].reshape(1, -1)
                scaled_vector = self.scaler.transform(word_vector, copy=True)[0]
                x_matrix[doc.doc_id][i] = scaled_vector

        # Multi-input architectures receive the same matrix on every input.
        if isinstance(self.keras_model.input, list):
            x = [x_matrix] * len(self.keras_model.input)
        else:
            x = [x_matrix]

        y_predicted = self.keras_model.predict(x)
        zipped = zip(self.labels, y_predicted[0])
        return sorted(zipped, key=lambda elem: elem[1], reverse=True)

    def init_word_vectors(self, train_dir, vec_dim=EMBEDDING_SIZE):
        """
        Train the word2vec model and fit the scaler afterwards.

        :param train_dir: directory with '.txt' files
        :param vec_dim: dimensionality of the word vectors
        :return: None
        """
        self.train_word2vec(train_dir, vec_dim=vec_dim)
        self.fit_scaler(train_dir)
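
    # Illustrative note: init_word_vectors() above is shorthand for calling
    # the two steps explicitly:
    #
    #     dltc.train_word2vec('data/train', vec_dim=EMBEDDING_SIZE)
    #     dltc.fit_scaler('data/train')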

    def train_word2vec(self, train_dir, vec_dim=EMBEDDING_SIZE):
        """
        Train the word2vec model on a directory with text files.

        :param train_dir: directory with '.txt' files
        :param vec_dim: dimensionality of the word vectors
        :return: trained gensim model
        """
        if self.word2vec_model:
            print('WARNING! Overwriting an already trained word2vec model.',
                  file=sys.stderr)
        self.word2vec_model = train_word2vec(train_dir, vec_dim=vec_dim)
        return self.word2vec_model

    def fit_scaler(self, train_dir):
        """
        Fit a scaler on the given data. Word vectors must be trained already.

        :param train_dir: directory with '.txt' files
        :return: fitted scaler object
        """
        if not self.word2vec_model:
            raise ValueError('word2vec model is not trained. Run train_word2vec() first.')
        if self.scaler:
            print('WARNING! Overwriting an already fitted scaler.',
                  file=sys.stderr)
        self.scaler = fit_scaler(train_dir, word2vec_model=self.word2vec_model)
        return self.scaler

    def save_scaler(self, filepath, overwrite=False):
        """ Save the scaler object to a file """
        if not self.scaler:
            raise ValueError("Can't save the scaler, it has not been trained yet")
        save_to_disk(filepath, self.scaler, overwrite=overwrite)

    def load_scaler(self, filepath):
        """ Load the scaler object from a file """
        self.scaler = load_from_disk(filepath)

    def save_word2vec_model(self, filepath, overwrite=False):
        """ Save the word2vec model to a file """
        if not self.word2vec_model:
            raise ValueError("Can't save the word2vec model, it has not been trained yet")
        save_to_disk(filepath, self.word2vec_model, overwrite=overwrite)

    def load_word2vec_model(self, filepath):
        """ Load the word2vec model from a file """
        self.word2vec_model = load_from_disk(filepath)

    def save_model(self, filepath):
        """ Save the Keras NN model to an HDF5 file """
        if not self.keras_model:
            raise ValueError("Can't save the model, it has not been trained yet")
        if os.path.exists(filepath):
            raise ValueError("File " + filepath + " already exists!")
        self.keras_model.save(filepath)

    def load_model(self, filepath):
        """ Load the Keras NN model from an HDF5 file """
        if not os.path.exists(filepath):
            raise ValueError("File " + filepath + " does not exist")
        self.keras_model = keras.models.load_model(filepath)
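

# Minimal end-to-end sketch (illustrative only; the directory layout and the
# label list are assumptions, not part of the original module):
if __name__ == '__main__':
    labels = ['economy', 'politics', 'sports']   # hypothetical vocabulary
    dltc = DLTC()
    dltc.init_word_vectors('data/train')         # expects '.txt' files
    dltc.train('data/train', labels)             # expects '.txt'/'.lab' pairs
    print(dltc.predict_from_text('example input text'))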