from __future__ import print_function, unicode_literals
import os
import six
import numpy as np
from functools import reduce

from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler

from coffeehouse_dltc.base.document import Document
from coffeehouse_dltc.config import EMBEDDING_SIZE, WORD2VEC_WORKERS, MIN_WORD_COUNT, \
    WORD2VEC_CONTEXT
from coffeehouse_dltc.utils import get_documents, save_to_disk


def train_word2vec_in_memory(docs, vec_dim=EMBEDDING_SIZE):
    """
    Build word embeddings from the given documents and return a trained model.

    :param docs: list of Document objects
    :param vec_dim: the dimensionality of the word vectors being built

    :return: trained gensim Word2Vec object with word embeddings
    """
    doc_sentences = map(lambda d: d.read_sentences(), docs)
    all_sentences = reduce(lambda d1, d2: d1 + d2, doc_sentences)

    # Initialize and train the model. Note: this uses the pre-4.0 gensim
    # API (`size` was renamed to `vector_size` in gensim 4.0).
    model = Word2Vec(
        all_sentences,
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    return model
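

# Usage sketch (hypothetical file paths, not part of the original module):
#
#     docs = [Document(0, 'corpus/doc0.txt'), Document(1, 'corpus/doc1.txt')]
#     model = train_word2vec_in_memory(docs, vec_dim=100)
#     model.wv['keyword']  # -> numpy array of shape (100,)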


def compute_word2vec_for_phrase(phrase, model):
    """
    Compute an embedding for a multi-word phrase by summing the vectors
    of its words under a given model.

    :param phrase: unicode, parsed label of a keyphrase
    :param model: gensim Word2Vec object

    :return: numpy array
    """
    result = np.zeros(model.vector_size, dtype='float32')
    for word in phrase.split():
        # Words missing from the model's vocabulary contribute nothing
        if word in model.wv:
            result += model.wv[word]

    return result
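

# Usage sketch (assumes `model` was produced by one of the training
# functions in this module):
#
#     vec = compute_word2vec_for_phrase('machine learning', model)
#     vec.shape  # -> (model.vector_size,)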


def fit_scaler(data_dir, word2vec_model, batch_size=1024, persist_to_path=None):
    """ Gather all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices. """
    if isinstance(word2vec_model, six.string_types):
        word2vec_model = Word2Vec.load(word2vec_model)

    doc_generator = get_documents(data_dir)
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(six.next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        # The generator may be exhausted exactly at a batch boundary,
        # leaving an empty batch; skip it so partial_fit is never
        # called on an empty matrix.
        if not batch:
            break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model.wv:
                    vectors.append(word2vec_model.wv[word])

        matrix = np.array(vectors)
        print("Fitted to {} vectors".format(matrix.shape[0]))

        scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
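

# Usage sketch (hypothetical paths; `fit_scaler` accepts either a loaded
# Word2Vec object or a path to a saved one):
#
#     scaler = fit_scaler('data/documents', 'models/word2vec.gensim',
#                         persist_to_path='models/scaler.pickle')
#     normalized = scaler.transform(feature_matrix)  # any 2D feature matrix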


def train_word2vec(doc_directory, vec_dim=EMBEDDING_SIZE):
    """
    Train a Word2Vec model iteratively, streaming documents from disk
    one at a time instead of loading the whole corpus into memory.

    :param doc_directory: directory with the documents
    :param vec_dim: the dimensionality of the word vectors being built

    :return: Word2Vec object
    """
    class SentenceIterator(object):
        """Restartable sentence stream. gensim iterates over the corpus
        several times (once to build the vocabulary, then for training),
        so a plain generator would not suffice."""
        def __init__(self, dirname):
            self.dirname = dirname

        def __iter__(self):
            # Collapse to unique base names, assuming 4-character
            # extensions (e.g. '.txt'), so each document is read once
            files = {filename[:-4] for filename in os.listdir(self.dirname)}
            for doc_id, fname in enumerate(files):
                d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
                for sentence in d.read_sentences():
                    yield sentence

    # Initialize and train the model
    model = Word2Vec(
        SentenceIterator(doc_directory),
        workers=WORD2VEC_WORKERS,
        size=vec_dim,
        min_count=MIN_WORD_COUNT,
        window=WORD2VEC_CONTEXT,
    )

    # If you don't plan to train the model any further, calling
    # init_sims will make the model much more memory-efficient.
    model.init_sims(replace=True)

    return model
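

# Usage sketch (hypothetical directory containing one '.txt' file per document):
#
#     model = train_word2vec('data/documents', vec_dim=100)
#     model.wv.most_similar('coffee')  # nearest neighbours in embedding space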