from __future__ import print_function, unicode_literals
import os
import six
import numpy as np
from functools import reduce

from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler

from coffeehouse_dltc.base.document import Document
from coffeehouse_dltc.config import EMBEDDING_SIZE, WORD2VEC_WORKERS, MIN_WORD_COUNT, \
    WORD2VEC_CONTEXT
from coffeehouse_dltc.utils import get_documents, save_to_disk


def train_word2vec_in_memory(docs, vec_dim=EMBEDDING_SIZE):
    """
    Build word embeddings from the given documents and return a trained model.

    :param docs: list of Document objects
    :param vec_dim: the dimensionality of the word vectors being built

    :return: trained gensim Word2Vec object with word embeddings
    """
    doc_sentences = map(lambda d: d.read_sentences(), docs)
    all_sentences = reduce(lambda d1, d2: d1 + d2, doc_sentences)

    # Initialize and train the model. Note: this uses the pre-4.0 gensim
    # API (`size` was renamed to `vector_size` in gensim 4.0).
    model = Word2Vec(
        all_sentences,
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    return model
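

# Usage sketch (hypothetical file paths, not part of the original module):
#
#     docs = [Document(0, 'corpus/doc0.txt'), Document(1, 'corpus/doc1.txt')]
#     model = train_word2vec_in_memory(docs, vec_dim=100)
#     model.wv['keyword']  # -> numpy array of shape (100,)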


def compute_word2vec_for_phrase(phrase, model):
    """
    Compute an embedding for a multi-word phrase by summing the vectors
    of its words under a given model.

    :param phrase: unicode, parsed label of a keyphrase
    :param model: gensim Word2Vec object

    :return: numpy array
    """
    result = np.zeros(model.vector_size, dtype='float32')
    for word in phrase.split():
        # Words missing from the model's vocabulary contribute nothing
        if word in model.wv:
            result += model.wv[word]

    return result
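

# Usage sketch (assumes `model` was produced by one of the training
# functions in this module):
#
#     vec = compute_word2vec_for_phrase('machine learning', model)
#     vec.shape  # -> (model.vector_size,)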


def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """ Gather all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices. """
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        # The generator may be exhausted exactly at a batch boundary,
        # leaving an empty batch; skip it so partial_fit is never
        # called on an empty matrix.
        if not batch:
            break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model.wv:
                    vectors.append(word2vec_model.wv[word])

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
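

# Usage sketch (hypothetical paths; `fit_scaler` accepts either a loaded
# Word2Vec object or a path to a saved one):
#
#     scaler = fit_scaler('data/documents', 'models/word2vec.gensim',
#                         persist_to_path='models/scaler.pickle')
#     normalized = scaler.transform(feature_matrix)  # any 2D feature matrix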


def train_word2vec(doc_directory, vec_dim=EMBEDDING_SIZE):
    """
    Train a Word2Vec model iteratively, streaming documents from disk
    one at a time instead of loading the whole corpus into memory.

    :param doc_directory: directory with the documents
    :param vec_dim: the dimensionality of the word vectors being built

    :return: Word2Vec object
    """
    class SentenceIterator(object):
        """Restartable sentence stream. gensim iterates over the corpus
        several times (once to build the vocabulary, then for training),
        so a plain generator would not suffice."""
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            # Collapse to unique base names, assuming 4-character
            # extensions (e.g. '.txt'), so each document is read once
            files = {filename[:-4] for filename in os.listdir(self.dirname)}
            for doc_id, fname in enumerate(files):
                d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
                for sentence in d.read_sentences():
                    yield sentence

    # Initialize and train the model
    model = Word2Vec(
        SentenceIterator(doc_directory),
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    return model
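

# Usage sketch (hypothetical directory containing one '.txt' file per document):
#
#     model = train_word2vec('data/documents', vec_dim=100)
#     model.wv.most_similar('coffee')  # nearest neighbours in embedding space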