Source code for redditscore.models.doc2vec

from random import shuffle

import numpy as np
import pandas as pd
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from scipy.spatial.distance import cdist
from sklearn.linear_model import LogisticRegression

from . import redditmodel


class Doc2VecModel(redditmodel.RedditModel):
    """Classifier built on similarities to per-class Doc2Vec embeddings.

    Training tags every document with its class label, so the underlying
    gensim ``Doc2Vec`` model learns one tag vector per class. Each document
    is then represented by the cosine similarities between its inferred
    vector and every class vector, and a ``LogisticRegression`` is fitted
    on those similarity features.

    Parameters
    ----------
    random_state : int, optional
        Passed to the parent constructor; ``fit`` reads
        ``self.random_state`` to seed NumPy before shuffling the corpus
        (presumably stored by the parent class -- confirm).
    dm, vector_size, window, negative, hs, min_count, sample, epochs, \
dbow_words, workers
        Stored as attributes and forwarded unchanged to the gensim
        ``Doc2Vec`` constructor in ``fit``.
    steps : int, optional
        Forwarded as ``steps`` to ``Doc2Vec.infer_vector``.
    alpha : float, optional
        Forwarded as ``alpha`` to ``Doc2Vec.infer_vector``.
    """

    def __init__(self, random_state=24, dm=0, vector_size=100, window=5,
                 negative=5, hs=0, min_count=5, sample=1e-5, epochs=10,
                 dbow_words=0, workers=8, steps=1000, alpha=0.025):
        super().__init__(random_state=random_state)
        self.model_type = 'doc2vec'
        # Final-stage classifier trained on the similarity features.
        self._model = LogisticRegression()

        # Doc2Vec hyper-parameters (consumed in ``fit``).
        self.dm = dm
        self.vector_size = vector_size
        self.window = window
        self.negative = negative
        self.hs = hs
        self.min_count = min_count
        self.sample = sample
        self.epochs = epochs
        self.dbow_words = dbow_words
        self.workers = workers

        # Inference hyper-parameters (consumed in ``_calc_sims``).
        self.steps = steps
        self.alpha = alpha

    @staticmethod
    def _prepare_data(X, y):
        """Pair each document with its label as a ``TaggedDocument``.

        Parameters
        ----------
        X : iterable
            Documents (presumably token sequences -- confirm with callers).
        y : iterable
            Labels, iterated in lockstep with ``X``; ``zip`` stops at the
            shorter of the two.

        Returns
        -------
        tuple
            ``(alldocs, total_documents)``: the list of tagged documents
            and its length.
        """
        alldocs = [TaggedDocument(doc, [label]) for doc, label in zip(X, y)]
        return alldocs, len(alldocs)

    def _calc_sims(self, docs, steps, alpha):
        """Return cosine similarities between documents and class vectors.

        Parameters
        ----------
        docs : sequence
            Documents to embed; must support ``len()``.
        steps, alpha
            Forwarded to ``Doc2Vec.infer_vector``.

        Returns
        -------
        pandas.DataFrame
            One row per document and one column per class, in the order
            of ``self._classes``.
        """
        vectors = np.zeros((len(docs), self.vector_size))
        for i, doc in enumerate(docs):
            # NOTE(review): ``steps`` is the gensim 3.x keyword; gensim 4
            # appears to have renamed it to ``epochs`` -- confirm against
            # the pinned gensim version before upgrading.
            vectors[i, :] = self._doc2vec.infer_vector(
                doc, steps=steps, alpha=alpha)
        # ``cdist`` returns cosine *distance*; convert it to similarity.
        sims = 1 - cdist(vectors, self.class_embeddings, metric='cosine')
        return pd.DataFrame(sims, columns=self._classes)

    def fit(self, X, y):
        """Train the Doc2Vec model and the classifier on top of it.

        Parameters
        ----------
        X : sequence
            Training documents (presumably token sequences -- confirm).
        y : sequence
            Class label for each document in ``X``.
        """
        self._doc2vec = Doc2Vec(
            dm=self.dm, vector_size=self.vector_size, window=self.window,
            negative=self.negative, hs=self.hs, min_count=self.min_count,
            sample=self.sample, epochs=self.epochs,
            dbow_words=self.dbow_words, workers=self.workers)
        self._classes = sorted(np.unique(y))

        alldocs, total_documents = self._prepare_data(X, y)
        # Shuffle a copy so the list handed to ``build_vocab`` keeps its
        # original order.
        doclist = alldocs[:]
        self._doc2vec.build_vocab(alldocs)

        # Shuffle with NumPy's generator so the seed set on the previous
        # line actually controls the order. The stdlib ``random.shuffle``
        # used before has its own, separately seeded generator, which made
        # the training order differ from run to run.
        np.random.seed(self.random_state)
        np.random.shuffle(doclist)
        self._doc2vec.train(doclist, epochs=self._doc2vec.epochs,
                            total_examples=total_documents)

        # Documents were tagged with their label, so each label's tag
        # vector serves as that class's embedding.
        emb = np.zeros((len(self._classes), self.vector_size))
        for i, class_label in enumerate(self._classes):
            emb[i, :] = self._doc2vec.docvecs[class_label]
        self.class_embeddings = pd.DataFrame(emb, index=self._classes)

        sims = self._calc_sims(X, self.steps, self.alpha)
        self._model.fit(sims, y)
        self.fitted = True

    def predict(self, X):
        """Predict a class label for each document in ``X``.

        Returns whatever the underlying ``LogisticRegression.predict``
        returns for the similarity features of ``X``.
        """
        sims = self._calc_sims(X, self.steps, self.alpha)
        return self._model.predict(sims)

    def predict_proba(self, X):
        """Predict class probabilities for each document in ``X``.

        Returns whatever the underlying
        ``LogisticRegression.predict_proba`` returns for the similarity
        features of ``X``.
        """
        sims = self._calc_sims(X, self.steps, self.alpha)
        return self._model.predict_proba(sims)