# Source code for models.bow_mod

# -*- coding: utf-8 -*-
"""
bow_mod: A wrapper for Bag-of-Words models

Author: Evgenii Nikitin <e.nikitin@nyu.edu>

Part of https://github.com/crazyfrogspb/RedditScore project

Copyright (c) 2018 Evgenii Nikitin. All rights reserved.
This work is licensed under the terms of the MIT license.
"""

import dill as pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

from . import redditmodel


def load_model(filepath):
    """Load a pickled instance of SklearnModel.

    NOTE(security): unpickling executes arbitrary code; only load files
    from trusted sources.

    Parameters
    ----------
    filepath : str
        Path to the pickled model file.

    Returns
    -------
    SklearnModel
        Unpickled model.
    """
    with open(filepath, 'rb') as f:
        model = pickle.load(f)
    return model
class BoWModel(redditmodel.RedditModel):
    """A wrapper for Bag-of-Words models with or without tf-idf re-weighting.

    Parameters
    ----------
    estimator : scikit-learn model
        Estimator object (classifier or regressor).
    ngrams : int, optional
        The upper boundary of the range of n-values for different
        n-grams to be extracted.
    tfidf : bool, optional
        If True, use tf-idf re-weighting.
    random_state : integer, optional
        Random seed.

    Attributes
    ----------
    params : dict
        Dictionary with model parameters.
    """

    def __init__(self, estimator, ngrams=1, tfidf=True, random_state=24):
        super().__init__(random_state=random_state)
        self.ngrams = ngrams
        self.tfidf = tfidf
        self.model = Pipeline(
            [('vectorizer', self._make_vectorizer()), ('model', estimator)])

    def _make_vectorizer(self):
        # Build the vectorizer matching the current tfidf/ngrams settings.
        # Shared by __init__ and set_params so the two stay consistent.
        vectorizer_cls = TfidfVectorizer if self.tfidf else CountVectorizer
        return vectorizer_cls(analyzer=self._build_analyzer(self.ngrams))

    def set_params(self, **params):
        """Set the parameters of the model.

        Parameters
        ----------
        **params
            {'tfidf', 'ngrams', 'random_state'} or parameters of the
            corresponding models. Unknown keys are silently ignored.

        Returns
        -------
        BoWModel
            self, to allow method chaining.
        """
        for key, value in params.items():
            if hasattr(self, key):
                setattr(self, key, value)
            elif hasattr(self.model.named_steps['model'], key):
                setattr(self.model.named_steps['model'], key, value)
        # Rebuild the pipeline so a changed tfidf/ngrams setting takes
        # effect; the fitted estimator object itself is carried over.
        self.model = Pipeline(
            [('vectorizer', self._make_vectorizer()),
             ('model', self.model.named_steps['model'])])
        return self

    def save_model(self, filepath):
        """Save model to disk.

        Parameters
        ----------
        filepath : str
            Path to the file where the model will be saved.
        """
        with open(filepath, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def _build_analyzer(ngrams):
        # Build analyzer for vectorizers for a given ngram range.
        return lambda doc: redditmodel.word_ngrams(doc, (1, ngrams))