# Source code for models.fasttext_mod

# -*- coding: utf-8 -*-
"""
FastTextModel: A wrapper for Facebook fastText model

Author: Evgenii Nikitin <e.nikitin@nyu.edu>

Part of https://github.com/crazyfrogspb/RedditScore project

Copyright (c) 2018 Evgenii Nikitin. All rights reserved.
This work is licensed under the terms of the MIT license.
"""

import os
import pickle
import tempfile
import warnings
from collections import Sequence

import numpy as np
import pandas as pd

import fastText
from sklearn.base import BaseEstimator, ClassifierMixin

from . import redditmodel


def chunking_dot(big_matrix, small_matrix, chunk_size=50000):
    """Compute ``big_matrix @ small_matrix`` one row-block at a time.

    Processing ``chunk_size`` rows of the left operand per step keeps the
    peak memory footprint of the intermediate products small for very
    large matrices.
    """
    # A contiguous right operand makes each chunked dot product faster.
    small_matrix = np.ascontiguousarray(small_matrix)
    n_rows = big_matrix.shape[0]
    result = np.empty((n_rows, small_matrix.shape[1]))
    for start in range(0, n_rows, chunk_size):
        stop = start + chunk_size
        result[start:stop] = big_matrix[start:stop].dot(small_matrix)
    return result


def load_model(filepath):
    """Load pickled model.

    Parameters
    ----------
    filepath : str
        Path to the file where the model will be saved.
        NOTE: the directory has to contain two files with provided name:
        with '.pkl' and 'bin' file extensions.

    Returns
    -------
    FastTextModel
        Unpickled model object.
    """
    base = os.path.splitext(filepath)[0]
    # The wrapper object and the fastText binary are stored separately;
    # unpickle the wrapper first, then re-attach the binary model.
    with open(base + '.pkl', 'rb') as f:
        model = pickle.load(f)
    model.model._model = fastText.load_model(base + '.bin')
    return model
def check_multilabel(y): if y is None: return False elif isinstance(y[0], Sequence) and not isinstance(y[0], str): return True else: return False def _data_to_temp(X, label, y=None): # Generate temorary file fd, path = tempfile.mkstemp() multilabel = check_multilabel(y) with os.fdopen(fd, 'w') as tmp: for i in range(X.shape[0]): if y is not None: doc = '' if multilabel: for true_label in y[i]: doc += label + true_label + ' ' else: doc += '{}{} '.format(label, y[i]) if isinstance(X[i], list): doc += ' '.join(X[i]) elif isinstance(X[i], str): doc += X[i] else: raise ValueError( 'X has to be a sequence of tokens or strings') else: if isinstance(X[i], list): doc = ' '.join(X[i]) elif isinstance(X[i], str): doc = X[i] else: raise ValueError( 'X has to be a sequence of tokens or strings') tmp.write("%s\n" % doc) return path
class FastTextClassifier(BaseEstimator, ClassifierMixin):
    """Auxiliary sklearn-style wrapper for the fastText python library.

    Constructor parameters mirror those of ``fastText.train_supervised``
    and are stored unmodified so sklearn's ``get_params``/``set_params``
    work as expected.
    """

    def __init__(self, lr=0.1, dim=100, ws=5, epoch=5, minCount=1,
                 minCountLabel=0, minn=0, maxn=0, neg=5, wordNgrams=1,
                 loss="softmax", bucket=2000000, thread=12, lrUpdateRate=100,
                 t=1e-4, label="__label__", verbose=2):
        self.lr = lr
        self.dim = dim
        self.ws = ws
        self.epoch = epoch
        self.minCount = minCount
        self.minCountLabel = minCountLabel
        self.minn = minn
        self.maxn = maxn
        self.neg = neg
        self.wordNgrams = wordNgrams
        self.loss = loss
        self.bucket = bucket
        self.thread = thread
        self.lrUpdateRate = lrUpdateRate
        self.t = t
        self.label = label
        self.verbose = verbose
        self._model = None  # trained fastText model, set in fit()
        self._num_classes = None  # number of distinct labels seen in fit()

    def _to_docs(self, X):
        # Normalize input to a list of plain strings for fastText:
        # either join token lists or copy strings as-is.
        if isinstance(X[0], list):
            return [' '.join(doc) for doc in X]
        elif isinstance(X[0], str):
            return list(X)
        # Fixed typo in the original message ("contrain").
        raise ValueError("X has to contain sequence of tokens or strings")

    def fit(self, X, y):
        """Fit the fastText model on documents X with labels y."""
        path = _data_to_temp(X, self.label, y)
        self._num_classes = len(np.unique(y))
        try:
            self._model = fastText.train_supervised(
                path, lr=self.lr, dim=self.dim, ws=self.ws,
                epoch=self.epoch, minCount=self.minCount,
                minCountLabel=self.minCountLabel, minn=self.minn,
                maxn=self.maxn, neg=self.neg, wordNgrams=self.wordNgrams,
                loss=self.loss, bucket=self.bucket, thread=self.thread,
                lrUpdateRate=self.lrUpdateRate, t=self.t, label=self.label,
                verbose=self.verbose)
        finally:
            # Remove the temp training file even if training fails.
            os.remove(path)
        return self

    def predict(self, X):
        """Return the most likely label (prefix stripped) for each document."""
        docs = self._to_docs(X)
        predictions = self._model.predict(docs, k=1)[0]
        return np.array([pred[0][len(self.label):] for pred in predictions])

    def predict_proba(self, X):
        """Return per-class probabilities, one row per document."""
        docs = self._to_docs(X)
        predictions = zip(*self._model.predict(docs, k=self._num_classes))
        probabilities = []
        for pred in predictions:
            d = {key[len(self.label):]: value
                 for key, value in zip(*pred)}
            probabilities.append(d)
        # Classes fastText never returned get a tiny probability instead of
        # NaN so downstream log-based scores stay finite.
        return pd.DataFrame(probabilities).fillna(1e-10)
class FastTextModel(redditmodel.RedditModel):
    """Facebook fastText classifier

    Parameters
    ----------
    random_state : int, optional
        Random seed (the default is 24).
    **kwargs
        Other parameters for fastText model. Full description can be found here:
        https://github.com/facebookresearch/fastText
    """

    def __init__(self, random_state=24, **kwargs):
        super().__init__(random_state=random_state)
        self.model = FastTextClassifier(**kwargs)

    def fit(self, X, y):
        """Fit model

        Parameters
        ----------
        X: iterable, shape (n_samples, )
            Sequence of tokenized documents
        y: iterable, shape (n_samples, )
            Sequence of labels

        Returns
        -------
        FastTextModel
            Fitted model object
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)
        if not isinstance(y, np.ndarray):
            y = np.array(y)
        # For multilabel targets, collect the union of all per-sample labels.
        if check_multilabel(y):
            unique_labels = np.unique(list(redditmodel.flatten(y)))
        else:
            unique_labels = np.unique(y)
        self.classes_ = np.array(sorted(unique_labels))
        self.model.fit(X, y)

        # Dump the softmax layer to a temp file to recover class embeddings.
        fd, path = tempfile.mkstemp()
        # save_softmax opens the path by name; close the raw descriptor from
        # mkstemp so it does not leak (original code never closed it).
        os.close(fd)
        try:
            self.model._model.save_softmax(path)
            emb = pd.read_csv(path, skiprows=[0], delimiter=' ',
                              header=None).dropna(axis=1)
        finally:
            # Remove the temp file even if parsing fails.
            os.remove(path)
        emb = emb.round(decimals=5)
        # Strip the fastText label prefix and index rows by class name.
        emb[0] = emb[0].str[len(self.model.label):]
        emb.set_index(0, inplace=True)
        self.class_embeddings = emb.loc[self.classes_]
        self.fitted = True
        return self

    def save_model(self, filepath):
        """Save model to disk.

        Parameters
        ----------
        filepath : str
            Path to the file where the model will be saved.
            NOTE: The model will be saved in two files: with '.pkl'
            and 'bin' file extensions.
        """
        base = os.path.splitext(filepath)[0]
        self.model._model.save_model(base + '.bin')
        # The binary fastText object is not picklable: detach it, pickle the
        # wrapper, then re-attach it by reloading the just-saved binary.
        self.model._model = None
        try:
            with open(base + '.pkl', 'wb') as f:
                pickle.dump(self, f)
        finally:
            # Restore the model even if pickling fails, so the in-memory
            # object is not left unusable (original code skipped this).
            self.model._model = fastText.load_model(base + '.bin')