# CofeehousePy/dltc/coffeehouse_dltc/base/word2vec.py

from __future__ import print_function, unicode_literals
import os
import six
import numpy as np
from functools import reduce
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler
from coffeehouse_dltc.base.document import Document
from coffeehouse_dltc.config import EMBEDDING_SIZE, WORD2VEC_WORKERS, MIN_WORD_COUNT, \
WORD2VEC_CONTEXT
from coffeehouse_dltc.utils import get_documents, save_to_disk


def train_word2vec_in_memory(docs, vec_dim=EMBEDDING_SIZE):
    """
    Build word embeddings from documents and return a trained model.
    :param docs: list of Document objects
    :param vec_dim: dimensionality of the vectors being built
    :return: trained gensim Word2Vec object with word embeddings
    """
    doc_sentences = map(lambda d: d.read_sentences(), docs)
    all_sentences = reduce(lambda d1, d2: d1 + d2, doc_sentences)

    # Initialize and train the model
    model = Word2Vec(
        all_sentences,
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)
    return model
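

# Usage sketch (illustrative only; `corpus_docs` is a hypothetical list of
# Document objects assembled elsewhere, e.g. from get_documents()):
#
#     model = train_word2vec_in_memory(corpus_docs, vec_dim=EMBEDDING_SIZE)
#     print(model.wv.most_similar('coffee'))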


def compute_word2vec_for_phrase(phrase, model):
    """
    Compute the embedding of a multiword phrase by summing its word vectors.
    :param phrase: unicode, parsed label of a keyphrase
    :param model: gensim Word2Vec object
    :return: numpy array of shape (model.vector_size,)
    """
    result = np.zeros(model.vector_size, dtype='float32')
    for word in phrase.split():
        if word in model.wv:
            result += model.wv[word]
    return result
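

# Usage sketch (illustrative; 'latte art' is a made-up keyphrase label and
# `model` a trained Word2Vec object, e.g. from train_word2vec_in_memory()):
#
#     vec = compute_word2vec_for_phrase('latte art', model)
#     assert vec.shape == (model.vector_size,)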


def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """ Gather all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices. """
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model.wv:
                    vectors.append(word2vec_model.wv[word])

        if not vectors:
            # Guard against an empty final batch, which would otherwise
            # make partial_fit fail on a zero-sample matrix.
            continue

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))
        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
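

# Usage sketch (paths and `feature_matrix` are hypothetical; the fitted scaler
# can later be reused to normalize feature matrices built from the same embeddings):
#
#     scaler = fit_scaler('/data/corpus', '/models/word2vec.model',
#                         persist_to_path='/models/scaler.pickle')
#     normalized = scaler.transform(feature_matrix)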


def train_word2vec(doc_directory, vec_dim=EMBEDDING_SIZE):
    """
    Train a Word2Vec model iteratively, streaming documents from disk one at a time.
    :param doc_directory: directory with the documents
    :param vec_dim: dimensionality of the vectors being built
    :return: trained gensim Word2Vec object
    """
    class SentenceIterator(object):
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            # Collect unique document basenames (extension assumed to be
            # four characters long, e.g. '.txt').
            files = {filename[:-4] for filename in os.listdir(self.dirname)}
            for doc_id, fname in enumerate(files):
                d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
                for sentence in d.read_sentences():
                    yield sentence

    # Initialize and train the model
    model = Word2Vec(
        SentenceIterator(doc_directory),
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)
    return model
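

# Usage sketch (directory path is hypothetical; expects one .txt file per
# document, streamed from disk instead of loaded into memory at once):
#
#     model = train_word2vec('/data/corpus', vec_dim=EMBEDDING_SIZE)
#     model.save('/models/word2vec.model')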