-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathgetESPNDataNBA.py
181 lines (164 loc) · 6.7 KB
/
getESPNDataNBA.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
This file grabs the complete play-by-play pages and box score pages for the
games specified. Run it from the terminal with:
python getESPNDataNBA.py date|gameidfile|gameid [outputname]
where the first argument can be a date of the form YYYYMMDD, a file containing
a list of ESPN game ids, or a single ESPN game id. The second, optional
argument is the root name of the output pickle files, one for the raw
play-by-play pages and the other for the raw box score pages. The data are
stored as dictionaries with the ESPN game ids as keys and the raw pages as
values. If a date is used as input, the program attempts to locate the scores
summary page for that date and extracts the ESPN game ids from it.
"""
import sys, os
import re
import datetime
import urllib2
from BeautifulSoup import BeautifulSoup as Soup
import parseESPN
from argshandle import getargs
from picklingIsEasy import picklehandle
# For reference, the complete URLs are formed by appending a date or an ESPN
# game id to the prefixes defined below:
#   nba_root    = "http://scores.espn.go.com/nba/scoreboard?date=" + date
#   nba_pbp_all = "http://scores.espn.go.com/nba/playbyplay?gameId=" + gameID + "&period=0"
#   nba_box     = "http://scores.espn.go.com/nba/boxscore?gameId=" + gameID
#   ncaa_root   = "http://scores.espn.go.com/ncb/scoreboard?date=" + date

# --- Links and paths ---

# Daily scoreboard pages; a date string is appended.
nba_root = "http://scores.espn.go.com/nba/scoreboard?date="
ncaa_root = "http://scores.espn.go.com/ncb/scoreboard?date="

# Per-game pages; an ESPN game id is appended.
nba_pbp = "http://scores.espn.go.com/nba/playbyplay?gameId="
nba_box = "http://scores.espn.go.com/nba/boxscore?gameId="
nba_ext = "http://scores.espn.go.com/nba/recap?gameId="
nba_shots = "http://scores.espn.go.com/nba/shotchart?gameId="

# Fallback directory searched for the game-id file.
default_path = "/Users/sinn/NBA-Data-Stuff/DataFiles"

# Scoreboard prefix for each supported category key.
root_dict = dict(NBA=nba_root, NCAM=ncaa_root)

null_value = ' '
max_args = 2
def runmain(gameids, argdict):
    """
    Collect the play-by-play, box score and recap results for every id in
    ``gameids`` and hand them to ``picklehandle`` together with ``argdict``.

    The results are gathered in a dict with the keys 'pbp', 'box' and 'ext';
    each value maps a game id to whatever the matching fetcher returned.

    Returns 1 after the results have been handed off.
    """
    collected = {'pbp': {}, 'box': {}, 'ext': {}}
    # Fetch order per game: play-by-play, then box score, then recap.
    fetchers = (('pbp', getpbp), ('box', getbox), ('ext', getext))
    for game in gameids:
        print('Grabbing game ' + str(game) + '...')
        for label, fetch in fetchers:
            collected[label][game] = fetch(game)
    picklehandle(collected, argdict)
    return 1
def getext(gameID):
    """
    Fetch and process the ESPN recap page for one game.

    Really this is the recap page, but it also carries some other info such
    as game location and time, plus the story analysis of the game.

    gameID -- ESPN game id; it is converted with str() to build the URL.

    Returns the result of parseESPN.processESPNpage for the recap URL, or an
    empty list when that call raises ValueError.
    """
    try:
        # The parameter is spelled ``gameID``; the body previously read the
        # undefined name ``gameid`` and raised NameError on every call.
        url = nba_ext + str(gameID)
        # NOTE(review): 'extta' is passed through unchanged -- confirm this is
        # the page-type key parseESPN expects.
        ext = parseESPN.processESPNpage(url, 'extta')
        return ext
    except ValueError:
        # need some stuff to spit out error info...
        print('Failed to retreive recap for game ' + str(gameID))
        return list()
def getpbp(gameid):
    """
    Fetch and process the ESPN play-by-play page for one game.

    The URL is the play-by-play prefix plus the game id plus "&period=0"
    (presumably the all-periods view -- confirm against ESPN).

    Returns the result of parseESPN.processESPNpage for that URL, or an
    empty list when that call raises ValueError.
    """
    try:
        page_url = "".join((nba_pbp, str(gameid), "&period=0"))
        result = parseESPN.processESPNpage(page_url, 'pbp')
    except ValueError:
        # need some stuff to spit out error info...
        print('Failed to retreive play-by-play for game ' + str(gameid))
        result = list()
    return result
def getbox(gameid):
    """
    Fetch and process the ESPN box score page for one game.

    Returns the result of parseESPN.processESPNpage for the box score URL,
    or an empty list when that call raises ValueError.
    """
    failed = False
    try:
        box_url = nba_box + str(gameid)
        parsed = parseESPN.processESPNpage(box_url, 'box')
    except ValueError:
        failed = True
    if not failed:
        return parsed
    # need some stuff to spit out error info...
    print('Failed to retreive box score for game ' + str(gameid))
    return list()
'''These two functions handle obtaining the game ids we want to grab pages for'''
def getidsfile(fhandle):
    """
    Read ESPN game ids from a text file holding one id per line.

    fhandle -- path of the file to read (despite the name, this is handed to
               open(), so it is a path rather than an open file object).

    Returns a list of ints in file order.  Blank lines (including the empty
    piece after a trailing newline) are skipped silently.  Any other line
    that int() cannot convert is dropped, and a single warning is printed if
    at least one such line was found.
    """
    with open(fhandle, 'r') as f1:
        raw = f1.read()
    gameids = list()
    found_invalid = False
    for line in raw.split('\n'):
        entry = line.strip()
        if not entry:
            # Blank line: not an id, but not worth a warning either.
            continue
        try:
            gameids.append(int(entry))
        except ValueError:
            found_invalid = True
    if found_invalid:
        print('Some game ids are not valid; removing invalid ids')
    return gameids
def getidswebs(date, cat='NBA'):
    """
    Extract ESPN game ids from the scoreboard page for a given date.

    date -- date string in YYYYMMDD form; it is checked with verifydate()
            and appended to the scoreboard URL prefix.
    cat  -- key into root_dict selecting the scoreboard prefix; an unknown
            key falls back to 'NBA' after printing a notice.

    Returns the list of game-id strings captured by the page regex.  An
    empty list is returned when the date is rejected or the page cannot be
    fetched, so callers can test the result for emptiness.
    """
    key_phrase = re.compile(r'''var thisGame = new gameObj\("(\d{7,12})".*\)''')
    if not verifydate(date):
        return []
    date_formatted = date[4:6] + ' ' + date[6:] + ', ' + date[:4]
    print("Attempting to get %s page from %s" % (cat, date_formatted))
    if cat not in root_dict:
        print('Non-valid category, %s, provided; using "NBA"' % (cat,))
        cat = 'NBA'
    url = root_dict[cat] + date
    try:
        raw_day_summary = urllib2.urlopen(url).read()
    except urllib2.URLError:
        # Without a page there is nothing to search; return early instead of
        # touching an unassigned raw_day_summary.
        print('Failed to fetch ' + url)
        return []
    return key_phrase.findall(raw_day_summary)
''''''
def verifydate(date):
    """
    Check that ``date`` is a real calendar date in YYYYMMDD form that is not
    in the future.

    date -- string to validate.

    Returns True when the string is exactly 8 characters long, parses as a
    calendar date, and is on or before today's local date.  Returns False
    otherwise; a message is printed for malformed input, while a future
    date is rejected silently.
    """
    if len(date) != 8:
        print('WARNING: non-valid date or date in invalid format')
        return False
    try:
        requested = datetime.datetime.strptime(date, '%Y%m%d').date()
    except ValueError:
        print('non-valid date or date in invalid format')
        return False
    # Compare whole dates: checking year, month and day independently would
    # reject e.g. last December when run in March.
    return requested <= datetime.date.today()
if __name__=='__main__':
    # Default run from terminal: obtain the game ids (from a text file that
    # lists them, or from the scoreboard page for a date), fetch the pages
    # for each game, and pickle the results.
    argdict = getargs(argslist=['file', 'date', 'outname', 'outform'])
    argdict.setdefault('outform', 'processed')
    argdict.setdefault('outname', "temp01_PBP.pkl")

    # Start empty so a missing file or missing arguments reaches the
    # explicit error below instead of raising NameError.
    gameids = []
    if 'file' in argdict:
        # Try the path as given first, then relative to default_path.
        fallback = os.path.join(default_path, argdict['file'])
        if os.path.isfile(argdict['file']):
            gameids = getidsfile(argdict['file'])
        elif os.path.isfile(fallback):
            gameids = getidsfile(fallback)
    elif 'date' in argdict:
        gameids = getidswebs(argdict['date'])

    if not gameids:
        msg = 'No valid game ids provided. Terminating program.'
        raise ValueError(msg)

    # If everything is OK up to this point, run the main code.
    if runmain(gameids, argdict):
        print("Process complete.")