from __future__ import unicode_literals, print_function, division import math import os import sys import json import keras.models import numpy as np from coffeehouse_dltc.base.document import Document from coffeehouse_dltc.base.word2vec import train_word2vec, fit_scaler from coffeehouse_dltc.config import NN_ARCHITECTURE, BATCH_SIZE, EMBEDDING_SIZE, EPOCHS from coffeehouse_dltc.nn.input_data import get_data_for_model from coffeehouse_dltc.nn.models import get_nn_model from coffeehouse_dltc.utils import save_to_disk, load_from_disk # noinspection DuplicatedCode class DLTC(object): def __init__(self): """ Public Constructor """ self.labels = None self.keras_model = None self.word2vec_model = None self.scaler = None def load_model_cluster(self, model_directory): """ Loads the model cluster into memory in which the model can be used to be predicted from :param model_directory: The directory which contains the model files such as .che, .chs, .chm and .chl :return: None """ if not os.path.exists(model_directory): raise FileNotFoundError("The model directory does not exist") embeddings_path = os.path.join(model_directory, "{0}.che". format(os.path.basename(model_directory[:-6]))) scaler_path = os.path.join(model_directory, "{0}.chs". format(os.path.basename(model_directory[:-6]))) model_file_path = os.path.join(model_directory, "{0}.chm". format(os.path.basename(model_directory[:-6]))) labels_file_path = os.path.join(model_directory, "{0}.chl". format(os.path.basename(model_directory[:-6]))) if not os.path.exists(embeddings_path): raise FileNotFoundError("The embeddings model was not found ('{0}')". format(embeddings_path)) if not os.path.exists(scaler_path): raise FileNotFoundError("The scaler model was not found ('{0}')". format(scaler_path)) if not os.path.exists(model_file_path): raise FileNotFoundError("The classification model was not found ('{0}')". format(model_file_path)) if not os.path.exists(labels_file_path): raise FileNotFoundError("The labels file was not found ('{0}')". format(labels_file_path)) # Read the labels file with open(labels_file_path, 'r') as f: self.labels = json.load(f) self.load_model(model_file_path) self.load_word2vec_model(embeddings_path) self.load_scaler(scaler_path) def train(self, train_dir, vocabulary, test_dir=None, callbacks=None, nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, test_ratio=0.0, epochs=EPOCHS, verbose=1): """ Train the model on given data :param train_dir: directory with data files. Text files should end with '.txt' and corresponding files containing labels should end with '.lab' :param vocabulary: iterable containing all considered labels :param test_dir: directory with test files. They will be used to evaluate the model after every epoch of training. :param callbacks: objects passed to the Keras fit function as callbacks :param nn_model: string defining the NN architecture e.g. 'crnn' :param batch_size: size of one batch :param test_ratio: the ratio of samples that will be withheld from training and used for testing. This can be overridden by test_dir. :param epochs: number of epochs to train :param verbose: 0, 1 or 2. As in Keras. :return: History object """ if not self.word2vec_model: raise RuntimeError('word2vec model is not trained. ' + 'Run train_word2vec() first.') if not self.scaler: raise RuntimeError('The scaler is not trained. ' + 'Run fit_scaler() first.') if not os.path.isdir(train_dir): raise ValueError('The training directory ' + train_dir + ' does not exist') if test_dir and not os.path.isdir(test_dir): raise ValueError('The test directory ' + test_dir + ' does not exist') if self.keras_model: print('WARNING! Overwriting already trained Keras model.', file=sys.stderr) self.labels = vocabulary self.keras_model = get_nn_model( nn_model, embedding=self.word2vec_model.vector_size, output_length=len(vocabulary) ) (x_train, y_train), test_data = get_data_for_model( train_dir, vocabulary, test_dir=test_dir, nn_model=self.keras_model, as_generator=False, batch_size=batch_size, word2vec_model=self.word2vec_model, scaler=self.scaler, ) return self.keras_model.fit( x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=test_data, validation_split=test_ratio, callbacks=callbacks or [], verbose=verbose, ) def batch_train(self, train_dir, vocabulary, test_dir=None, callbacks=None, nn_model=NN_ARCHITECTURE, batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1): """ Train the model on given data :param train_dir: directory with data files. Text files should end with '.txt' and corresponding files containing labels should end with '.lab' :param vocabulary: iterable containing all considered labels :param test_dir: directory with test files. They will be used to evaluate the model after every epoch of training. :param callbacks: objects passed to the Keras fit function as callbacks :param nn_model: string defining the NN architecture e.g. 'crnn' :param batch_size: size of one batch :param epochs: number of epochs to train :param verbose: 0, 1 or 2. As in Keras. :return: History object """ if not self.word2vec_model: raise RuntimeError('word2vec model is not trained. ' + 'Run train_word2vec() first.') if not self.scaler: raise RuntimeError('The scaler is not trained. ' + 'Run fit_scaler() first.') if not os.path.isdir(train_dir): raise ValueError('The training directory ' + train_dir + ' does not exist') if test_dir and not os.path.isdir(test_dir): raise ValueError('The test directory ' + test_dir + ' does not exist') if self.keras_model: print('WARNING! Overwriting already trained Keras model.', file=sys.stderr) self.labels = vocabulary self.keras_model = get_nn_model( nn_model, embedding=self.word2vec_model.vector_size, output_length=len(vocabulary) ) train_generator, test_data = get_data_for_model( train_dir, vocabulary, test_dir=test_dir, nn_model=self.keras_model, as_generator=True, batch_size=batch_size, word2vec_model=self.word2vec_model, scaler=self.scaler, ) nb_of_files = len({filename[:-4] for filename in os.listdir(train_dir)}) steps_per_epoch = math.ceil(nb_of_files / batch_size) return self.keras_model.fit_generator( train_generator, steps_per_epoch=steps_per_epoch, epochs=epochs, validation_data=test_data, callbacks=callbacks or [], verbose=verbose, ) def predict_from_file(self, filepath): """ Predict labels for a txt file :param filepath: path to the file :return: list of labels with corresponding confidence intervals """ doc = Document(0, filepath) return dict(self._predict(doc)) def predict_from_text(self, text): """ Predict labels for a given string of text :param text: string or unicode with the text :return: list of labels with corresponding confidence intervals """ doc = Document(0, None, text=text) return dict(self._predict(doc)) def _predict(self, doc): """ Predict labels for a given Document object :param doc: Document object :return: list of labels with corresponding confidence intervals """ if type(self.keras_model.input) == list: _, sample_length, embedding_size = self.keras_model.input_shape[0] else: _, sample_length, embedding_size = self.keras_model.input_shape words = doc.get_all_words()[:sample_length] x_matrix = np.zeros((1, sample_length, embedding_size)) for i, w in enumerate(words): if w in self.word2vec_model.wv: word_vector = self.word2vec_model.wv[w].reshape(1, -1) scaled_vector = self.scaler.transform(word_vector, copy=True)[0] x_matrix[doc.doc_id][i] = scaled_vector if type(self.keras_model.input) == list: x = [x_matrix] * len(self.keras_model.input) else: x = [x_matrix] y_predicted = self.keras_model.predict(x) zipped = zip(self.labels, y_predicted[0]) return sorted(zipped, key=lambda elem: elem[1], reverse=True) def init_word_vectors(self, train_dir, vec_dim=EMBEDDING_SIZE): """ Train word2vec model and fit the scaler afterwards :param train_dir: directory with '.txt' files :param vec_dim: dimensionality of the word vectors :return: None """ self.train_word2vec(train_dir, vec_dim=vec_dim) self.fit_scaler(train_dir) def train_word2vec(self, train_dir, vec_dim=EMBEDDING_SIZE): """ Train the word2vec model on a directory with text files. :param train_dir: directory with '.txt' files :param vec_dim: dimensionality of the word vectors :return: trained gensim model """ if self.word2vec_model: print('WARNING! Overwriting already trained word2vec model.', file=sys.stderr) self.word2vec_model = train_word2vec(train_dir, vec_dim=vec_dim) return self.word2vec_model def fit_scaler(self, train_dir): """ Fit a scaler on given data. Word vectors must be trained already. :param train_dir: directory with '.txt' files :return: fitted scaler object """ if not self.word2vec_model: raise ValueError('word2vec model is not trained. Run train_word2vec() first.') if self.scaler: print('WARNING! Overwriting already fitted scaler.', file=sys.stderr) self.scaler = fit_scaler(train_dir, word2vec_model=self.word2vec_model) return self.scaler def save_scaler(self, filepath, overwrite=False): """ Save the scaler object to a file """ if not self.scaler: raise ValueError("Can't save the scaler, it has not been trained yet") save_to_disk(filepath, self.scaler, overwrite=overwrite) def load_scaler(self, filepath): """ Load the scaler object from a file """ self.scaler = load_from_disk(filepath) def save_word2vec_model(self, filepath, overwrite=False): """ Save the word2vec model to a file """ if not self.word2vec_model: raise ValueError("Can't save the word2vec model, it has not been trained yet") save_to_disk(filepath, self.word2vec_model, overwrite=overwrite) def load_word2vec_model(self, filepath): """ Load the word2vec model from a file """ self.word2vec_model = load_from_disk(filepath) def save_model(self, filepath): """ Save the keras NN model to a HDF5 file """ if not self.keras_model: raise ValueError("Can't save the model, it has not been trained yet") if os.path.exists(filepath): raise ValueError("File " + filepath + " already exists!") self.keras_model.save(filepath) def load_model(self, filepath): """ Load the keras NN model from a HDF5 file """ if not os.path.exists(filepath): raise ValueError("File " + filepath + " does not exist") self.keras_model = keras.models.load_model(filepath)