-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathget.py
113 lines (84 loc) · 3.31 KB
/
get.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env python
import mechanicalsoup
import time
import pandas as pd
import os
from tqdm import tqdm
# Single browser session shared by get_overview() and get_stats().
browser = mechanicalsoup.StatefulBrowser()
def get_overview(id):
    """Get the overview stats from a player's overview page.

    Opens ``https://www.premierleague.com/players/{id}/player/overview`` with
    the module-level ``browser`` and reads the fields from the parsed page.

    Args:
        id: Player identifier interpolated into the URL. The name shadows the
            builtin ``id`` but is kept so existing callers keep working.

    Returns:
        A 6-tuple ``(name, nationality, position, clubs, seasons,
        num_of_seasons)``.

        * All six values are ``None`` when no page could be parsed or the
          page has no name element.
        * ``nationality`` and ``position`` are ``None`` when their element
          is absent.
        * ``clubs`` and ``seasons`` are lists of unique strings in the order
          they appear on the page; ``num_of_seasons`` is ``len(seasons)``.
    """
    browser.open(f"https://www.premierleague.com/players/{id}/player/overview")
    page = browser.get_current_page()
    if page is None:
        return None, None, None, None, None, None

    name_tag = page.find("div", class_="name t-colour")
    if name_tag is None:
        # No name element: treat the id as "no such player".
        return None, None, None, None, None, None
    name = str(name_tag.string)

    country_tag = page.find("span", class_="playerCountry")
    nationality = None if country_tag is None else str(country_tag.string)

    # The position normally sits in the first "info" div. When that div has
    # no single string (a different page layout), the second "info" div's
    # text is used instead. A missing div yields None rather than an
    # uncaught AttributeError/IndexError.
    info_tags = page.find_all("div", class_="info")
    position = None
    if info_tags:
        position = str(info_tags[0].string)
        if position == 'None':
            position = info_tags[1].text if len(info_tags) > 1 else None

    # dict.fromkeys() de-duplicates while preserving page order. Converting
    # to str *before* the uniqueness check means repeated 'None' entries are
    # collapsed too, so they cannot inflate num_of_seasons.
    clubs = list(dict.fromkeys(
        str(tag.string) for tag in page.find_all("span", class_="long")
    ))
    seasons = list(dict.fromkeys(
        str(tag.string) for tag in page.find_all("td", class_="season")
    ))
    num_of_seasons = len(seasons)

    return name, nationality, position, clubs, seasons, num_of_seasons
def _stat_int(page, css_class, default=None):
    """Return the first ``span`` with ``css_class`` as an int, else ``default``."""
    tag = page.find("span", class_=css_class)
    if tag is None or tag.string is None:
        return default
    return int(tag.string)


def get_stats(id):
    """Get the detailed stats from a player's stats page.

    Opens ``https://www.premierleague.com/players/{id}/player/stats`` with
    the module-level ``browser`` and reads six counters from the parsed page.

    Args:
        id: Player identifier interpolated into the URL. The name shadows the
            builtin ``id`` but is kept so existing callers keep working.

    Returns:
        A 6-tuple ``(clean_sheets, goals, assists, apps, wins, losses)``.

        * All six values are ``None`` when no page could be parsed or the
          goals counter is missing.
        * ``clean_sheets`` falls back to ``0`` when its element is absent.
        * ``assists``, ``apps``, ``wins`` and ``losses`` are ``None`` when
          their element is absent, instead of raising.

    Raises:
        ValueError: If a counter is present but its text is not an integer.
    """
    browser.open(f"https://www.premierleague.com/players/{id}/player/stats")
    page = browser.get_current_page()
    if page is None:
        return None, None, None, None, None, None

    goals = _stat_int(page, "allStatContainer statgoals")
    if goals is None:
        # Without a goals counter the page is treated as having no stats.
        return None, None, None, None, None, None

    assists = _stat_int(page, "allStatContainer statgoal_assist")
    clean_sheets = _stat_int(page, "allStatContainer statclean_sheet", default=0)
    apps = _stat_int(page, "allStatContainer statappearances")
    wins = _stat_int(page, "allStatContainer statwins")
    losses = _stat_int(page, "allStatContainer statlosses")

    return clean_sheets, goals, assists, apps, wins, losses
# Column order shared by the CSV on disk and every freshly scraped row.
COLUMNS = [
    'id', 'name', 'position', 'nationality', 'clubs',
    'seasons', 'num_of_seasons', 'apps', 'wins',
    'losses', 'clean_sheets', 'assists', 'goals'
]
CSV_PATH = os.path.join(os.getcwd(), 'data/premier_league_player_stats.csv')


def _flush(frame, rows):
    """Append ``rows`` to ``frame``, write the result to CSV_PATH, return it."""
    if rows:
        batch = pd.DataFrame(rows, columns=COLUMNS)
        # DataFrame.append() was removed in pandas 2.0; pd.concat() replaces
        # it. An empty frame is swapped out rather than concatenated.
        frame = batch if frame.empty else pd.concat([frame, batch], ignore_index=True)
    # Create the output directory if needed so the first save cannot fail.
    os.makedirs(os.path.dirname(CSV_PATH), exist_ok=True)
    frame.to_csv(CSV_PATH, index=False)
    return frame


try:
    # Try to resume from a previously saved dataframe first.
    df = pd.read_csv(CSV_PATH)
except FileNotFoundError:
    # If none exists, start from a blank dataframe.
    df = pd.DataFrame(columns=COLUMNS)

# Build the lookup once instead of rebuilding a list on every iteration.
collected_ids = set(df['id'].tolist())
pending_rows = []

try:
    # Run for all players.
    for player_id in tqdm(range(1, 17000)):
        # Only run for players not already collected.
        if player_id in collected_ids:
            continue

        name, nationality, position, clubs, seasons, num_of_seasons = get_overview(player_id)
        if name is not None:
            clean_sheets, goals, assists, apps, wins, losses = get_stats(player_id)
            pending_rows.append([
                player_id, name, position, nationality, clubs, seasons,
                num_of_seasons, apps, wins, losses, clean_sheets,
                assists, goals
            ])

        if player_id % 100 == 0:
            # Pause and save at every 100th id.
            time.sleep(5)
            df = _flush(df, pending_rows)
            pending_rows = []
finally:
    # Always write what was gathered since the last checkpoint: the range
    # ends at 16999, so the tail would otherwise never be saved, and an
    # interruption would otherwise lose the rows still pending.
    df = _flush(df, pending_rows)