-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathparseESPN.py
70 lines (65 loc) · 2.49 KB
/
parseESPN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
'''
Parsing helpers for ESPN sports pages (tested only on NBA data so far).
'''
import sys, os, re
from BeautifulSoup import BeautifulSoup as BS
import genBSandURLtools
# NOTE(review): presumably a placeholder string for missing values; it is not
# referenced by the functions visible in this file -- confirm against callers.
null_value = ' '
def getESPNpbp(data_in, mode='url'):
    '''
    Parse an ESPN play-by-play page into a header row and play rows.

    data_in and mode are handed unchanged to
    genBSandURLtools.getDataType, which is asked for the page's 'table'
    elements; with the default mode='url', data_in is a play-by-play url
    obtained from the score-summary ESPN page.  The first table whose
    text contains 'TIME' is taken to be the play-by-play table.

    Returns a dict with two keys:
      'head'    -- text of the 'th' cells of the table's second row
                   (time, away, score, home);
      'content' -- one list of 'td' cell texts for every row from the
                   third row onwards.

    Raises AttributeError when no table containing 'TIME' is found.
    '''
    tables = genBSandURLtools.getDataType(data_in, 'table', mode)
    pbp_tables = [t for t in tables if t.text.find('TIME') > -1]
    if not pbp_tables:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise AttributeError(
            "no play-by-play table (one containing 'TIME') found in page")
    rows = pbp_tables[0].findAll('tr')
    # The headers (e.g. home and away team for the game) are read from the
    # second row; the first row is not used.
    header = [str(h.text) for h in rows[1].findAll('th')]
    # Every remaining row is treated as play data.
    content = [[str(cell.text) for cell in row.findAll('td')]
               for row in rows[2:]]
    return {'head': header, 'content': content}
def getESPNbox(data_in, mode='url'):
    '''
    Parse an ESPN box-score page into header cells, data cells and links.

    data_in and mode are handed unchanged to
    genBSandURLtools.getDataType, which is asked for the page's 'table'
    elements; with the default mode='url', data_in is a box score url
    obtained from the score-summary ESPN page.  The first table whose
    text contains 'STARTERS' is taken to be the box score.

    Returns a dict with three keys:
      'details'     -- per table row, the list of its 'th' cell texts
                       (headers, teams stuff);
      'content'     -- per table row, the list of its 'td' cell texts
                       (actual player data);
      'playerlinks' -- the dict built by getESPNplayerlinks from the
                       same rows.

    Raises AttributeError when no table containing 'STARTERS' is found.
    '''
    tables = genBSandURLtools.getDataType(data_in, 'table', mode)
    box_tables = [t for t in tables if t.text.find('STARTERS') > -1]
    if not box_tables:
        # Call-style raise is valid on both Python 2 and Python 3.
        raise AttributeError(
            "no box score table (one containing 'STARTERS') found in page")
    rows = box_tables[0].findAll('tr')
    # Both lists get exactly one entry per row (an empty list when the row
    # has no cells of that kind), so their indices line up with the rows.
    details = [[str(h.text) for h in row.findAll('th')] for row in rows]
    content = [[str(c.text) for c in row.findAll('td')] for row in rows]
    playerlink_dict = getESPNplayerlinks(rows)
    return {'details': details, 'content': content,
            'playerlinks': playerlink_dict}
def getESPNplayerlinks(summary):
    '''
    Collect the ESPN page urls for players listed on a box score page.

    summary is an iterable of row objects, each offering findAll('a');
    only the first anchor of a row is looked at.  Rows without an anchor,
    and anchors with a missing or empty href, are skipped.

    Returns a dict whose keys are the anchor texts (the full names of
    players as used in the box score) and whose values are the urls.
    '''
    playerlink_dict = dict()
    for row in summary:
        anchors = row.findAll('a')
        if not anchors:
            continue
        anchor = anchors[0]
        href = anchor.get('href')
        # Test the raw value: str(None) is the truthy string 'None', so
        # stringifying before the check would let link-less anchors through.
        if href:
            playerlink_dict[str(anchor.text)] = str(href)
    return playerlink_dict