Source code for redditscore.get_reddit_data
import calendar
import datetime
import os
from pandas.io import gbq
def diff_month(d1, d2):
    """Return the (signed) number of months from ``d2`` to ``d1``.

    Only the ``year`` and ``month`` attributes are consulted; days are
    ignored, so dates within the same calendar month give 0.
    """
    years_apart = d1.year - d2.year
    return 12 * years_apart + (d1.month - d2.month)
def add_months(sourcedate, months):
    """Return ``sourcedate`` shifted forward by ``months`` calendar months.

    The day-of-month is clamped to the last valid day of the target month
    (e.g. Jan 31 + 1 month -> Feb 28/29), matching spreadsheet-style
    month arithmetic.
    """
    zero_based = sourcedate.month - 1 + months
    target_year = sourcedate.year + zero_based // 12
    target_month = zero_based % 12 + 1
    # monthrange()[1] is the number of days in the target month.
    last_day = calendar.monthrange(target_year, target_month)[1]
    return datetime.date(target_year, target_month, min(sourcedate.day, last_day))
def check_input(subreddits, usernames):
    """Turn subreddit/username lists into SQL ``AND ... in (...)`` clauses.

    Parameters
    ----------
    subreddits: list of str or None
    usernames: list of str or None

    Returns
    -------
    (str, str): the subreddit clause and the author clause; an empty string
    when the corresponding list is falsy.

    Each non-empty clause starts with a single space so it can be
    concatenated directly after other SQL text (e.g. ``... > 0``) without
    producing malformed SQL such as ``0AND subreddit in (...)``.

    NOTE(review): values are interpolated into the query verbatim — a name
    containing a double quote would break (or alter) the generated SQL.
    Callers pass curated subreddit/user lists, but this is worth hardening.
    """
    if subreddits:
        quoted = '"' + '", "'.join(subreddits) + '"'
        # Leading space fixes concatenation with the preceding SQL text.
        subreddits = ' AND subreddit in (' + quoted + ')'
    else:
        subreddits = ''
    if usernames:
        quoted = '"' + '", "'.join(usernames) + '"'
        usernames = ' AND author in (' + quoted + ')'
    else:
        usernames = ''
    return subreddits, usernames
def construct_query(subreddits, usernames, month, score_limit=None):
    """Build a BigQuery legacy-SQL query fetching ALL matching comments
    from the monthly table ``fh-bigquery:reddit_comments.<month>``.

    Parameters
    ----------
    subreddits, usernames: lists (or None) handed to ``check_input``, which
        renders them as ``AND ... in (...)`` clauses (empty string if falsy).
    month: str
        Table suffix in '%Y_%m' format, e.g. '2016_08'.
    score_limit: int, optional
        Minimum comment score; None disables the score filter.

    Returns
    -------
    str: the assembled query text.
    """
    # Construct a query string
    subreddits, usernames = check_input(subreddits, usernames)
    if score_limit is None:
        score = ""
    else:
        score = " AND score >= {}".format(int(score_limit))
    # The WHERE block filters out deleted/removed comments and common
    # moderation-bot boilerplate before the dynamic clauses are appended.
    # NOTE(review): the clauses from check_input are concatenated directly
    # after '> 0' — confirm each non-empty clause begins with a space so the
    # generated SQL stays valid.
    query = """
SELECT
id,
body,
subreddit,
author,
created_utc,
link_id,
parent_id,
score
FROM [fh-bigquery:reddit_comments.""" + month + """]
WHERE
body != '[deleted]'
AND body != '[removed]'
AND body NOT LIKE '%has been removed%'
AND body NOT LIKE '%has been overwritten%'
AND body NOT LIKE '%performed automatically%'
AND body NOT LIKE '%bot action performed%'
AND body NOT LIKE '%autowikibot%'
AND body NOT LIKE '%I am a bot%'
AND LENGTH(body) > 0""" + score + subreddits + usernames
    return query
def construct_sample_score_query(subreddits, usernames, month, sample_size,
                                 score_limit=None):
    """Build a BigQuery legacy-SQL query sampling the TOP-SCORING comments
    per subreddit from ``fh-bigquery:reddit_comments.<month>``.

    Uses ``ROW_NUMBER() OVER (PARTITION BY subreddit ORDER BY score DESC)``
    in an inner query, then keeps the first ``sample_size`` rows of each
    subreddit partition.

    Parameters
    ----------
    subreddits, usernames: lists (or None) handed to ``check_input``.
    month: str, table suffix in '%Y_%m' format.
    sample_size: int, number of top comments to keep per subreddit.
    score_limit: int, optional minimum score; None disables the filter.

    Returns
    -------
    str: the assembled query text.
    """
    # Construct a query with sampling top-scoring comments
    subreddits, usernames = check_input(subreddits, usernames)
    if score_limit is None:
        score = ""
    else:
        score = " AND score >= {}".format(int(score_limit))
    # Same bot/deleted-comment filters as construct_query, applied inside
    # the subquery so ranking only considers rows that survive the filters.
    # NOTE(review): confirm the clauses from check_input begin with a space
    # so concatenation after '> 0' yields valid SQL.
    query = """
SELECT
id,
body,
subreddit,
author,
created_utc,
link_id,
parent_id,
score
FROM (
SELECT
id,
body,
subreddit,
author,
created_utc,
link_id,
parent_id,
score,
ROW_NUMBER() OVER(PARTITION BY subreddit ORDER BY score DESC) as pos
FROM [fh-bigquery:reddit_comments.""" + month + """]
WHERE
body != '[deleted]'
AND body != '[removed]'
AND body NOT LIKE '%has been removed%'
AND body NOT LIKE '%has been overwritten%'
AND body NOT LIKE '%performed automatically%'
AND body NOT LIKE '%bot action performed%'
AND body NOT LIKE '%autowikibot%'
AND body NOT LIKE '%I am a bot%'
AND LENGTH(body) > 0""" + score + subreddits + usernames + """
)
WHERE pos <= """ + str(sample_size)
    return query
def construct_sample_query(subreddits, usernames, month, sample_size, score_limit=None):
    """Build a BigQuery legacy-SQL query RANDOMLY sampling comments per
    subreddit from ``fh-bigquery:reddit_comments.<month>``.

    A ``RAND()`` column is ranked with ``ROW_NUMBER() OVER (PARTITION BY
    subreddit ORDER BY rnd)`` so each subreddit contributes up to
    ``sample_size`` uniformly-drawn rows.

    Parameters
    ----------
    subreddits, usernames: lists (or None) handed to ``check_input``.
    month: str, table suffix in '%Y_%m' format.
    sample_size: int, number of random comments to keep per subreddit.
    score_limit: int, optional minimum score; None disables the filter.

    Returns
    -------
    str: the assembled query text.
    """
    # Construct a query string with random sampling
    subreddits, usernames = check_input(subreddits, usernames)
    if score_limit is None:
        score = ""
    else:
        score = " AND score >= {}".format(int(score_limit))
    # Same bot/deleted-comment filters as construct_query; sampling happens
    # inside the subquery so only filtered rows are ranked.
    # NOTE(review): confirm the clauses from check_input begin with a space
    # so concatenation after '> 0' yields valid SQL.
    query = """
SELECT
id,
body,
subreddit,
author,
created_utc,
link_id,
parent_id,
score
FROM (
SELECT
id,
body,
subreddit,
author,
created_utc,
link_id,
parent_id,
score,
RAND() as rnd,
ROW_NUMBER() OVER(PARTITION BY subreddit ORDER BY rnd) as pos
FROM [fh-bigquery:reddit_comments.""" + month + """]
WHERE
body != '[deleted]'
AND body != '[removed]'
AND body NOT LIKE '%has been removed%'
AND body NOT LIKE '%has been overwritten%'
AND body NOT LIKE '%performed automatically%'
AND body NOT LIKE '%bot action performed%'
AND body NOT LIKE '%autowikibot%'
AND body NOT LIKE '%I am a bot%'
AND LENGTH(body) > 0""" + score + subreddits + usernames + """
)
WHERE pos <= """ + str(sample_size)
    return query
def get_comments(timerange, project_id, private_key,
                 subreddits=None, usernames=None, score_limit=None,
                 comments_per_month=None, top_scores=False, csv_directory=None,
                 verbose=False, configuration=None):
    """
    Obtain Reddit comments using Google BigQuery

    Parameters
    ----------
    timerange: iterable, shape (2,)
        Start and end dates in the '%Y_%m' format.
        Example: ('2016_08', '2017_02')
    project_id: str
        Google BigQuery Account project ID
    private_key: str
        File path to JSON file with service account private key
        https://cloud.google.com/bigquery/docs/reference/libraries
    subreddits: list, optional
        List of subreddit names
    usernames: list, optional
        List of usernames
    score_limit: int, optional
        Score limit for comment retrieving. If None, retrieve all comments.
    comments_per_month: int, optional
        Number of comments to sample from each subreddit per month. If None,
        retrieve all comments.
    top_scores: bool, optional
        If True, sample top-scoring comments in each subreddit
        instead of random sampling.
    csv_directory: str, optional
        CSV directory to save retrieved data. If None, return a list of
        DataFrames with all comments.
    verbose: bool, optional
        If True, print the name of the table, which is being queried.
    configuration: dict, optional
        Query config parameters for job processing.

    Returns
    ----------
    dfs: list or None
        List of pd.DataFrames with comments (one per queried month), or
        None when csv_directory is given — in that case results are written
        to per-month CSV files instead.

    Raises
    ----------
    ValueError
        If subreddits/usernames are not lists, both are missing,
        comments_per_month is not an int, or timerange does not contain
        exactly two elements.
    TypeError
        If timerange is not iterable.
    OSError
        If csv_directory does not exist.
    """
    # --- input validation ---------------------------------------------------
    if subreddits and not isinstance(subreddits, list):
        raise ValueError(
            'subreddits argument must be a list, not {}'.format(type(subreddits)))
    if usernames and not isinstance(usernames, list):
        raise ValueError(
            'usernames argument must be a list, not {}'.format(type(usernames)))
    if not usernames and not subreddits:
        raise ValueError(
            'You have to specify a list of subreddits or a list of usernames')
    if (comments_per_month is not None) and \
            not isinstance(comments_per_month, int):
        raise ValueError('comments_per_month must be an integer, not {}'.format(
            type(comments_per_month)))
    if (csv_directory is not None) and \
            not os.path.isdir(csv_directory):
        raise OSError('{} does not exist'.format(csv_directory))
    try:
        iter(timerange)
    except TypeError as e:
        raise TypeError('timerange argument must be an iterable') from e
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, which would silently skip this validation.
    if len(timerange) != 2:
        raise ValueError('timerange argument has to contain only two elements')
    # --- iterate over monthly tables ----------------------------------------
    start = datetime.datetime.strptime(timerange[0], '%Y_%m')
    end = datetime.datetime.strptime(timerange[1], '%Y_%m')
    delta = diff_month(end, start)
    if csv_directory is None:
        dfs = []
    else:
        dfs = None  # results go to CSV files; nothing is accumulated
    for i in range(delta + 1):
        date = add_months(start, i)
        table_name = date.strftime('%Y_%m')
        if verbose:
            print(
                'Querying from [fh-bigquery:reddit_comments.{}]'.format(table_name))
        # Pick the query flavor: everything / top-scoring sample / random sample.
        if comments_per_month is None:
            query = construct_query(
                subreddits, usernames, table_name, score_limit)
        elif top_scores:
            query = construct_sample_score_query(
                subreddits, usernames, table_name, comments_per_month, score_limit)
        else:
            query = construct_sample_query(
                subreddits, usernames, table_name, comments_per_month, score_limit)
        df = gbq.read_gbq(query, project_id=project_id,
                          private_key=private_key, configuration=configuration)
        if csv_directory is None:
            dfs.append(df)
        else:
            filename = os.path.join(csv_directory, table_name + '.csv')
            if os.path.isfile(filename):
                # Append without repeating the header row.
                with open(filename, 'a') as f:
                    df.to_csv(f, header=False, index=False)
            else:
                # Reuse the path computed above instead of rebuilding it.
                df.to_csv(filename, index=False)
    return dfs