-
Notifications
You must be signed in to change notification settings - Fork 45
Expand file tree
/
Copy pathscraper.py
More file actions
156 lines (125 loc) · 5 KB
/
scraper.py
File metadata and controls
156 lines (125 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
import requests
import json
import datetime
import calendar
import logging
from dateutil import parser, relativedelta
from django.http import HttpResponse
from community.git import get_org_name
class Scraper():
    """
    This is the class responsible for scraping provided issues into a
    dictionary containing just statistical information of the data.

    Three views are produced, each ending at ``date``:

    * ``year``  - one bucket per calendar month,
    * ``month`` - one bucket per Monday-based week,
    * ``week``  - one bucket per day.

    In every view the lists are ordered oldest first, so the last entry is
    the bucket that contains ``date``.
    """

    # Count of months/weeks/days respectively to be scraped in past.
    CONSTANTS = {
        'month_count': 12,
        'week_count': 4,
        'day_count': 7,
    }

    def __init__(self, content, date):
        """
        Constructs a new ``Scraper``

        :param content: Github API Parsed JSON issues
        :param date: The date to scrape data till.
        """
        logger = logging.getLogger(__name__)
        logger.info('this package is alive')

        self.date = date
        self.content = content

        month_count = self.CONSTANTS['month_count']
        week_count = self.CONSTANTS['week_count']
        day_count = self.CONSTANTS['day_count']

        # Initialise data dicts
        self.data = {
            'year': {
                'labels': [],
                'closed': [0] * month_count,
                'opened': [0] * month_count,
            },
            'month': {
                'labels': [],
                'closed': [0] * week_count,
                'opened': [0] * week_count,
            },
            'week': {
                'labels': [],
                'closed': [0] * day_count,
                'opened': [0] * day_count,
            },
        }

        # Process labels for each option, walking from the oldest bucket
        # towards the one containing ``self.date``.
        for back in range(month_count - 1, -1, -1):
            then = self.date - relativedelta.relativedelta(months=back)
            self.data['year']['labels'].append(
                calendar.month_name[then.month])

        for back in range(week_count - 1, -1, -1):
            day = self.date - relativedelta.relativedelta(weeks=back)
            # Label each week by its Monday and its Sunday.
            strt = day - datetime.timedelta(days=day.weekday())
            fin = day + datetime.timedelta(days=6 - day.weekday())
            self.data['month']['labels'].append(
                calendar.month_abbr[strt.month] + ' ' + str(strt.day)
                + ' - '
                + calendar.month_abbr[fin.month] + ' ' + str(fin.day))

        for back in range(day_count - 1, -1, -1):
            day_idx = (self.date - datetime.timedelta(days=back)).weekday()
            self.data['week']['labels'].append(calendar.day_name[day_idx])

    def __diff_month(self, d):
        """
        :param d: Date as datetime, self.date >= Date.
        :return: Difference in months(int) ignoring partially complete months.
                 Negative when ``d`` falls in a later month than self.date.
        """
        return (self.date.year - d.year) * 12 + self.date.month - d.month

    def __diff_week(self, d):
        """
        :param d: Date as datetime, self.date >= Date.
        :return: Difference in weeks(int) ignoring partially complete weeks.
                 Negative when ``d`` falls in a later week than self.date.
        """
        # Shift both dates back to the Monday of their week before comparing.
        monday1 = self.date - datetime.timedelta(days=self.date.weekday())
        monday2 = d - datetime.timedelta(days=d.weekday())
        return (monday1 - monday2).days // 7

    def __diff_days(self, d):
        """
        :param d: Date as datetime, self.date >= Date.
        :return: Difference in days(int) ignoring partially complete days.
                 Negative when ``d`` is later than self.date.
        """
        return (self.date - d).days

    @staticmethod
    def __slot(age, size):
        """
        Translate an age in buckets into an index of the oldest-first lists.

        :param age: Number of buckets before the current one; 0 is current.
        :param size: Number of buckets in the view.
        :return: List index(int), or None if ``age`` is outside the window.
        """
        # The lower bound matters: a negative age (issue dated after
        # self.date) would otherwise yield an index >= size.
        if 0 <= age < size:
            return size - age - 1
        return None

    def __tally(self, view, age, count_key, issue):
        """
        Count one issue in the given view if it lies inside the window.

        :param view: Key of self.data to update ('year', 'month' or 'week').
        :param age: Number of buckets between the issue and self.date.
        :param count_key: Key of CONSTANTS holding the size of the view.
        :param issue: Issue dict; its 'state' is read only when counted.
        """
        idx = self.__slot(age, self.CONSTANTS[count_key])
        if idx is None:
            return
        self.data[view]['opened'][idx] += 1
        if issue['state'] == 'closed':
            self.data[view]['closed'][idx] += 1

    def get_data(self):
        """
        Get data

        Every issue is counted as opened in the bucket of its creation date
        and, if its state is 'closed', as closed in that same bucket. Issues
        outside a view's window (too old, or dated after self.date) are
        skipped for that view. Counts are added onto self.data, so calling
        this method again accumulates on top of the previous result.

        :return: Data in form of dict containing year, month, week data.
        """
        for entry in self.content:
            issue = entry['issue']

            # Parse date, while ignoring the timestamp.
            dt = parser.parse(issue['createdAt'][:10])

            self.__tally('year', self.__diff_month(dt), 'month_count', issue)
            self.__tally('month', self.__diff_week(dt), 'week_count', issue)
            self.__tally('week', self.__diff_days(dt), 'day_count', issue)

        return self.data
def activity_json(filename):
    """
    Download the organisation's issues and write activity statistics to disk.

    The issue dump is fetched from the organisation's ``gh-board`` GitHub
    Pages site, reduced to opened/closed counts by ``Scraper`` (scraping up
    to the current local date and time) and written to ``filename`` as
    indented JSON.

    :param filename: Path of the JSON file to create or overwrite.
    :return: ``HttpResponse('{}')`` when the downloaded body is not valid
             JSON (no file is written in that case); otherwise ``None``.
    """
    org_name = get_org_name()

    # URL to grab all issues from
    issues_url = 'http://' + org_name + '.github.io/gh-board/issues.json'

    # requests applies no timeout by default, so a stalled server would
    # block the caller forever; bound the wait instead.
    content = requests.get(issues_url, timeout=30)

    try:
        parsed_json = content.json()
    except ValueError:
        # ValueError is the common base of json.JSONDecodeError and of the
        # decode errors requests raises when it is backed by simplejson.
        return HttpResponse('{}')

    real_data = Scraper(parsed_json['issues'], datetime.datetime.today())
    real_data = real_data.get_data()

    with open(filename, 'w+') as f:
        json.dump(real_data, f, indent=4)