-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcsv-processing.py
261 lines (221 loc) · 9.22 KB
/
csv-processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
import json
import requests
import threading
import time
import os
import urllib2
from bs4 import BeautifulSoup
# lock on the post_obj for multithreading purposes
POST_DICT_LOCK = threading.Lock()
# article_name -> wp_post_obj
post_obj = {}
def main():
"""Primary method that processes the CSV data for our Cornell Sun
age classifier."""
global post_obj
# article_title -> {age_range -> total_sessions}
temp_dict = build_article_dict()
# (hashed_dict, total_sessions) = dict_into_buckets(temp_dict)
# TODO: Filter finished dictionaries by the successful responses for posts
# age_group -> number_sessions
total_sessions = count_sessions_by_age(temp_dict)
# article_title -> popular_age_range
classifications = classify_articles(temp_dict, total_sessions)
# write classifications into a text file to be opened in Swift
with open("generated_data/post_classifications.txt", "w+") as data_file:
json.dump(classifications, data_file)
if os.path.isfile("generated_data/posts_data.txt"):
# already generated posts, no need to request more
with open("generated_data/posts_data.txt", "r") as data_file:
post_obj = json.loads(data_file.read())
else:
# populates global post_obj dictionary
retrieve_post_information(classifications)
with open("generated_data/posts_data.txt", "w+") as data_file:
json.dump(post_obj, data_file)
training_data = build_training_data(post_obj)
# write training data into a text file to be opened in Swift
with open("generated_data/training_data.txt", "w+") as data_file:
json.dump(training_data, data_file)
def build_article_dict():
"""Read in the first CSV and sort all entries into article buckets."""
temp_dict = {}
with open("sun-classifier-data/sun-article-sessions.csv") as csv_file:
count = 0
for line in csv_file.readlines():
if count < 8:
count += 1
continue # skip headers
if count == 5007:
break # bottom data not formatted properly
count += 1
# example line:
# 55-64,"""WANG | The Seven Degrees of Cornell""",14,0,0,0,0,14
entry = line.split(',"""')
age_range = entry[0].strip()
second_entry = entry[1].split('""",')
article_name = second_entry[0]
article_name = article_name[:article_name.rfind("|")].strip().decode("utf8", "ignore")
sessions = second_entry[1].split(",")
total_sessions = int(sessions[0].strip())
if article_name in temp_dict:
# article already in dict
article_entry = temp_dict[article_name]
if age_range in article_entry:
# age group represented in dict -> increment
article_entry[age_range] += total_sessions
else:
# age range not represented -> set equal to #sessions
article_entry[age_range] = total_sessions
else:
# article name not represented
temp_dict[article_name] = { age_range: total_sessions }
return temp_dict
def dict_into_buckets(articles):
"""CURRENTLY UNUSED: Group ages into buckets as defined in writeup.
Currently hashes intobuckets by adding all the smaller age group sessions
into the largerbucket, which skews the information towards generally larger
buckets."""
bucket_dict = {}
total_sessions = {
'18-24': 0,
'25-44': 0,
'45+': 0
}
for article_name in articles:
ranges = articles[article_name]
article_ranges_dict = {
'18-24': 0,
'25-44': 0,
'45+': 0
}
for group in ranges:
# Hash into 18-24, 25-44, or 45+
if group == '18-24':
bucket_range = '18-24'
elif group == '25-34' or group == '35-44':
bucket_range = '25-44'
else:
bucket_range = '45+'
article_ranges_dict[bucket_range] += ranges[group]
total_sessions[bucket_range] += ranges[group]
bucket_dict[article_name] = article_ranges_dict
return (bucket_dict, total_sessions)
def count_sessions_by_age(articles):
"""Count how many sessions in each age group."""
total_sessions = {}
for article_name in articles:
sessions_by_age = articles[article_name]
for age_group in sessions_by_age:
sessions = sessions_by_age[age_group]
if age_group in total_sessions:
total_sessions[age_group] += sessions
else:
total_sessions[age_group] = sessions
return total_sessions
def classify_articles(articles, sessions):
"""Return dictionary of ArticleName: AgeGroupClassification depending on
number of sessions per article."""
classification_dict = {}
for article in articles:
buckets = articles[article]
opt_bucket = None
opt_bucket_sessions = 0.0
# get age_range with largest num sessions
for age_group in buckets:
if buckets[age_group] > opt_bucket_sessions:
opt_bucket = age_group
opt_bucket_sessions = buckets[age_group]
# hash into one of the three buckets as classification
if opt_bucket == '18-24':
opt_bucket = '18-24'
elif opt_bucket == '25-34' or opt_bucket == '35-44':
opt_bucket = '25-44'
else:
opt_bucket = '45+'
classification_dict[article] = opt_bucket
return classification_dict
def wp_request_article(article):
"""Search for the wordpress Post object from the REST API given the article
name."""
article = article.decode("utf8", "ignore")
url = "http://cornellsun.com/wp-json/wp/v2/posts?search={}".format(article)
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
}
with requests.get(url, headers=headers) as wp_response:
if wp_response.text == "[]":
# no responses for this article :(
print "No article objects found: {}".format(article)
return
try:
wp_post = json.loads(wp_response.text)
except:
print "Exception converting response to JSON: {}".format(article)
return
if "title" not in wp_post[0]:
# title not found?
print "Title not found in response: {}".format(article)
return
elif article != wp_post[0]["title"]["rendered"]:
# titles not equal, no bueno
print "Best result title not equal: {}".format(article)
return
wp_single_post = wp_post[0]
with POST_DICT_LOCK:
post_obj[article] = wp_single_post
def retrieve_post_information(classifications):
"""Run requests on multiple threads to the Wordpress backend. Retrieves the
closest match (if possible) to the unicode title and populates a dictionary
from the article title to the object."""
threads = []
for article in classifications:
thread = threading.Thread(target=wp_request_article, args=[article])
threads.append(thread)
# add 0.75s pause to not overflow the server taking requests
for thread in threads:
thread.start()
time.sleep(0.75)
for thread in threads:
thread.join(30.0)
def build_training_data(posts):
"""Evaluates the features for this article post and stores them in a dict."""
training_data = {}
for article in posts:
post_obj = posts[article]
features = _build_word_count_dict(post_obj)
training_data[article] = features
return training_data
def _build_word_count_dict(post):
"""Returns a dictionary of words to their number of occurrences for a given
post object."""
# TODO: Should filter out empty strings and beginning / ending characters
# not letters or numbers. ex. "(something"
word_counts = {}
soup = BeautifulSoup(post["content"]["rendered"], "html.parser")
for word in soup.get_text().replace("\n", " ").split(" "):
word = word.strip().replace(", ", " ")
if word in word_counts:
word_counts[word] += 1
else:
word_counts[word] = 1
return word_counts
def _build_feature_dict(post):
"""Return a dictionary of features for a given post object."""
features = {}
article_title = post["title"]["rendered"]
features["title_length"] = len(article_title)
features["categories"] = ', '.join(post["category_strings"])
features["tags"] = ', '.join(post["tag_strings"])
features["content_size"] = len(post["content"]["rendered"])
features["num_images"] = len(post["post_attachments_meta"])
features["primary_category"] = post["primary_category"]
num_words = len(article_title.split(" "))
features["average_word_length_title"] = len(article_title) / num_words
pipe_index = article_title.find("|")
if pipe_index == -1:
pipe_index = len(article_title)
features["title_split_on_pipe"] = article_title[:pipe_index].strip()
return features
if __name__ == '__main__':
main()