Source code for redditscore.get_reddit_data

import calendar
import datetime
import os

from pandas.io import gbq


[docs]def diff_month(d1, d2): # Get difference between dates in months return (d1.year - d2.year) * 12 + d1.month - d2.month
[docs]def add_months(sourcedate, months): # Add months to the date month = sourcedate.month - 1 + months year = sourcedate.year + month // 12 month = month % 12 + 1 day = min(sourcedate.day, calendar.monthrange(year, month)[1]) return datetime.date(year, month, day)
[docs]def check_input(subreddits, usernames): if subreddits: subreddits = '", "'.join(subreddits) subreddits = '"' + subreddits + '"' subreddits = 'AND subreddit in (' + subreddits + ')' else: subreddits = '' if usernames: usernames = '", "'.join(usernames) usernames = '"' + usernames + '"' usernames = 'AND author in (' + usernames + ')' else: usernames = '' return subreddits, usernames
[docs]def construct_query(subreddits, usernames, month, score_limit=None): # Construct a query string subreddits, usernames = check_input(subreddits, usernames) if score_limit is None: score = "" else: score = " AND score >= {}".format(int(score_limit)) query = """ SELECT id, body, subreddit, author, created_utc, link_id, parent_id, score FROM [fh-bigquery:reddit_comments.""" + month + """] WHERE body != '[deleted]' AND body != '[removed]' AND body NOT LIKE '%has been removed%' AND body NOT LIKE '%has been overwritten%' AND body NOT LIKE '%performed automatically%' AND body NOT LIKE '%bot action performed%' AND body NOT LIKE '%autowikibot%' AND body NOT LIKE '%I am a bot%' AND LENGTH(body) > 0""" + score + subreddits + usernames return query
[docs]def construct_sample_score_query(subreddits, usernames, month, sample_size, score_limit=None): # Construct a query with sampling top-scoring comments subreddits, usernames = check_input(subreddits, usernames) if score_limit is None: score = "" else: score = " AND score >= {}".format(int(score_limit)) query = """ SELECT id, body, subreddit, author, created_utc, link_id, parent_id, score FROM ( SELECT id, body, subreddit, author, created_utc, link_id, parent_id, score, ROW_NUMBER() OVER(PARTITION BY subreddit ORDER BY score DESC) as pos FROM [fh-bigquery:reddit_comments.""" + month + """] WHERE body != '[deleted]' AND body != '[removed]' AND body NOT LIKE '%has been removed%' AND body NOT LIKE '%has been overwritten%' AND body NOT LIKE '%performed automatically%' AND body NOT LIKE '%bot action performed%' AND body NOT LIKE '%autowikibot%' AND body NOT LIKE '%I am a bot%' AND LENGTH(body) > 0""" + score + subreddits + usernames + """ ) WHERE pos <= """ + str(sample_size) return query
[docs]def construct_sample_query(subreddits, usernames, month, sample_size, score_limit=None): # Constuct a query string with random sampling subreddits, usernames = check_input(subreddits, usernames) if score_limit is None: score = "" else: score = " AND score >= {}".format(int(score_limit)) query = """ SELECT id, body, subreddit, author, created_utc, link_id, parent_id, score FROM ( SELECT id, body, subreddit, author, created_utc, link_id, parent_id, score, RAND() as rnd, ROW_NUMBER() OVER(PARTITION BY subreddit ORDER BY rnd) as pos FROM [fh-bigquery:reddit_comments.""" + month + """] WHERE body != '[deleted]' AND body != '[removed]' AND body NOT LIKE '%has been removed%' AND body NOT LIKE '%has been overwritten%' AND body NOT LIKE '%performed automatically%' AND body NOT LIKE '%bot action performed%' AND body NOT LIKE '%autowikibot%' AND body NOT LIKE '%I am a bot%' AND LENGTH(body) > 0""" + score + subreddits + usernames + """ ) WHERE pos <= """ + str(sample_size) return query
[docs]def get_comments(timerange, project_id, private_key, subreddits=None, usernames=None, score_limit=None, comments_per_month=None, top_scores=False, csv_directory=None, verbose=False, configuration=None): """ Obtain Reddit comments using Google BigQuery Parameters ---------- timerange: iterable, shape (2,) Start and end dates in the '%Y_%m' format. Example: ('2016_08', '2017_02') project_id: str Google BigQuery Account project ID private_key: str File path to JSON file with service account private key https://cloud.google.com/bigquery/docs/reference/libraries subreddits: list, optional List of subreddit names usernames: list, optional List of usernames score_limit: int, optional Score limit for comment retrieving. If None, retrieve all comments. comments_per_month: int, optional Number of comments to sample from each subbredit per month. If None, retrieve all comments. top_scores: bool, optional If True, sample top-scoring comments in each subreddit instead of random sampling. csv_directory: str, optional CSV directory to save retrieved data. If None, return a DataFrame with all comments. verobse: bool, optional If True, print the name of the table, which is being queried. configuration: dict, optional Query config parameters for job processing. Returns ---------- dfs: list List of pd.DataFrames with comments """ if subreddits and not isinstance(subreddits, list): raise ValueError( 'subreddits argument must be a list, not {}'.format(type(subreddits))) if usernames and not isinstance(usernames, list): raise ValueError( 'usernames argument must be a list, not {}'.format(type(usernames))) if not usernames and not subreddits: raise ValueError( 'You have to specify a list of subreddits or a list of usernames') if (comments_per_month is not None) and \ not isinstance(comments_per_month, int): raise ValueError('comments_per_month must be an integer, not {}'.format( type(comments_per_month))) if (csv_directory is not None) and \ not os.path.isdir(csv_directory): raise OSError('{} does not exist'.format(csv_directory)) try: iter(timerange) except TypeError as e: raise TypeError('timerange argument must be an iterable') from e try: assert len(timerange) == 2 except AssertionError as e: raise ValueError( 'timerange argument has to contain only two elements') from e start = datetime.datetime.strptime(timerange[0], '%Y_%m') end = datetime.datetime.strptime(timerange[1], '%Y_%m') delta = diff_month(end, start) if csv_directory is None: dfs = [] else: dfs = None for i in range(delta + 1): date = add_months(start, i) table_name = date.strftime('%Y_%m') if verbose: print( 'Querying from [fh-bigquery:reddit_comments.{}]'.format(table_name)) if comments_per_month is None: query = construct_query( subreddits, usernames, table_name, score_limit) elif top_scores: query = construct_sample_score_query( subreddits, usernames, table_name, comments_per_month, score_limit) else: query = construct_sample_query( subreddits, usernames, table_name, comments_per_month, score_limit) df = gbq.read_gbq(query, project_id=project_id, private_key=private_key, configuration=configuration) if csv_directory is None: dfs.append(df) else: filename = os.path.join(csv_directory, table_name + '.csv') if os.path.isfile(filename): with open(filename, 'a') as f: df.to_csv(f, header=False, index=False) else: df.to_csv(os.path.join(csv_directory, table_name + '.csv'), index=False) return dfs