Skip to content

Commit 408f466

Browse files
committed
Updated content after dry run
1 parent 4f4da08 commit 408f466

6 files changed

+1160
-157
lines changed

part0_intro.ipynb

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
"metadata": {},
66
"source": [
77
"# Automating Machine Learning with Python and Azure\n",
8-
"By [Matt Eland](https://MattEland.dev) | [@IntegerMan](https://twitter.com/IntegerMan)"
8+
"By [Matt Eland](https://MattEland.dev) | [@IntegerMan](https://twitter.com/IntegerMan)\n",
9+
"\n",
10+
"**Note:** Full source code for this talk is available at [https://github.com/IntegerMan/AutoMLwithPythonAndAzure](https://github.com/IntegerMan/AutoMLwithPythonAndAzure)"
911
]
1012
},
1113
{

part1_dataprep.ipynb

+72-67
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
},
3232
{
3333
"cell_type": "code",
34-
"execution_count": 1,
34+
"execution_count": 20,
3535
"metadata": {},
3636
"outputs": [
3737
{
@@ -97,7 +97,7 @@
9797
"2000020005 52.0"
9898
]
9999
},
100-
"execution_count": 1,
100+
"execution_count": 20,
101101
"metadata": {},
102102
"output_type": "execute_result"
103103
}
@@ -120,7 +120,7 @@
120120
},
121121
{
122122
"cell_type": "code",
123-
"execution_count": 2,
123+
"execution_count": 33,
124124
"metadata": {},
125125
"outputs": [
126126
{
@@ -205,7 +205,7 @@
205205
"4 2017020586 20172018 R 20 24"
206206
]
207207
},
208-
"execution_count": 2,
208+
"execution_count": 33,
209209
"metadata": {},
210210
"output_type": "execute_result"
211211
}
@@ -220,6 +220,9 @@
220220
"# Historical information is helpful, but does become outdated. Let's look at 2015 onwards, then drop our identifiers that might throw off machine learning\n",
221221
"df_games = df_games[df_games['season'] >= 20152016]\n",
222222
"\n",
223+
"# A few unusual values occur. Make sure we're dealing with playoff or regular season games only\n",
224+
"df_games = df_games[(df_games['type'] == 'R') | (df_games['type'] == 'P')] \n",
225+
"\n",
223226
"# Remap the season to a string type\n",
224227
"df_games['season'] = df_games['season'].map(str)\n",
225228
"\n",
@@ -229,7 +232,7 @@
229232
},
230233
{
231234
"cell_type": "code",
232-
"execution_count": 3,
235+
"execution_count": 45,
233236
"metadata": {},
234237
"outputs": [
235238
{
@@ -263,64 +266,64 @@
263266
" </thead>\n",
264267
" <tbody>\n",
265268
" <tr>\n",
266-
" <th>0</th>\n",
267-
" <td>2000020001</td>\n",
268-
" <td>42.0</td>\n",
269-
" <td>NaN</td>\n",
270-
" <td>NaN</td>\n",
271-
" <td>NaN</td>\n",
272-
" <td>NaN</td>\n",
269+
" <th>26292</th>\n",
270+
" <td>2019030414</td>\n",
271+
" <td>36.0</td>\n",
272+
" <td>20192020</td>\n",
273+
" <td>P</td>\n",
274+
" <td>14.0</td>\n",
275+
" <td>25.0</td>\n",
273276
" </tr>\n",
274277
" <tr>\n",
275-
" <th>1</th>\n",
276-
" <td>2000020002</td>\n",
277-
" <td>32.0</td>\n",
278-
" <td>NaN</td>\n",
279-
" <td>NaN</td>\n",
280-
" <td>NaN</td>\n",
281-
" <td>NaN</td>\n",
278+
" <th>26293</th>\n",
279+
" <td>2019030415</td>\n",
280+
" <td>12.0</td>\n",
281+
" <td>20192020</td>\n",
282+
" <td>P</td>\n",
283+
" <td>25.0</td>\n",
284+
" <td>14.0</td>\n",
282285
" </tr>\n",
283286
" <tr>\n",
284-
" <th>2</th>\n",
285-
" <td>2000020003</td>\n",
286-
" <td>45.0</td>\n",
287-
" <td>NaN</td>\n",
288-
" <td>NaN</td>\n",
289-
" <td>NaN</td>\n",
290-
" <td>NaN</td>\n",
287+
" <th>26294</th>\n",
288+
" <td>2019030415</td>\n",
289+
" <td>12.0</td>\n",
290+
" <td>20192020</td>\n",
291+
" <td>P</td>\n",
292+
" <td>25.0</td>\n",
293+
" <td>14.0</td>\n",
291294
" </tr>\n",
292295
" <tr>\n",
293-
" <th>3</th>\n",
294-
" <td>2000020004</td>\n",
296+
" <th>26295</th>\n",
297+
" <td>2019030416</td>\n",
295298
" <td>24.0</td>\n",
296-
" <td>NaN</td>\n",
297-
" <td>NaN</td>\n",
298-
" <td>NaN</td>\n",
299-
" <td>NaN</td>\n",
299+
" <td>20192020</td>\n",
300+
" <td>P</td>\n",
301+
" <td>14.0</td>\n",
302+
" <td>25.0</td>\n",
300303
" </tr>\n",
301304
" <tr>\n",
302-
" <th>4</th>\n",
303-
" <td>2000020005</td>\n",
304-
" <td>52.0</td>\n",
305-
" <td>NaN</td>\n",
306-
" <td>NaN</td>\n",
307-
" <td>NaN</td>\n",
308-
" <td>NaN</td>\n",
305+
" <th>26296</th>\n",
306+
" <td>2019030416</td>\n",
307+
" <td>24.0</td>\n",
308+
" <td>20192020</td>\n",
309+
" <td>P</td>\n",
310+
" <td>14.0</td>\n",
311+
" <td>25.0</td>\n",
309312
" </tr>\n",
310313
" </tbody>\n",
311314
"</table>\n",
312315
"</div>"
313316
],
314317
"text/plain": [
315-
" game_id penaltyMinutes season type away_team_id home_team_id\n",
316-
"0 2000020001 42.0 NaN NaN NaN NaN\n",
317-
"1 2000020002 32.0 NaN NaN NaN NaN\n",
318-
"2 2000020003 45.0 NaN NaN NaN NaN\n",
319-
"3 2000020004 24.0 NaN NaN NaN NaN\n",
320-
"4 2000020005 52.0 NaN NaN NaN NaN"
318+
" game_id penaltyMinutes season type away_team_id home_team_id\n",
319+
"26292 2019030414 36.0 20192020 P 14.0 25.0\n",
320+
"26293 2019030415 12.0 20192020 P 25.0 14.0\n",
321+
"26294 2019030415 12.0 20192020 P 25.0 14.0\n",
322+
"26295 2019030416 24.0 20192020 P 14.0 25.0\n",
323+
"26296 2019030416 24.0 20192020 P 14.0 25.0"
321324
]
322325
},
323-
"execution_count": 3,
326+
"execution_count": 45,
324327
"metadata": {},
325328
"output_type": "execute_result"
326329
}
@@ -329,12 +332,15 @@
329332
"# Aggregate everything together\n",
330333
"df_detailed_pens = pd.merge(left=df_team_stats, right=df_games, how='left', left_on='game_id', right_on='game_id')\n",
331334
"\n",
332-
"df_detailed_pens.head()"
335+
"# Drop NaN occurrences\n",
336+
"df_detailed_pens = df_detailed_pens.dropna()\n",
337+
"\n",
338+
"df_detailed_pens.tail()"
333339
]
334340
},
335341
{
336342
"cell_type": "code",
337-
"execution_count": 4,
343+
"execution_count": 41,
338344
"metadata": {},
339345
"outputs": [
340346
{
@@ -401,7 +407,7 @@
401407
"4 6 Bruins"
402408
]
403409
},
404-
"execution_count": 4,
410+
"execution_count": 41,
405411
"metadata": {},
406412
"output_type": "execute_result"
407413
}
@@ -414,7 +420,7 @@
414420
},
415421
{
416422
"cell_type": "code",
417-
"execution_count": 5,
423+
"execution_count": 46,
418424
"metadata": {},
419425
"outputs": [
420426
{
@@ -505,20 +511,20 @@
505511
"4 2017021101 10.0 20172018 R Maple Leafs Canadiens"
506512
]
507513
},
508-
"execution_count": 5,
514+
"execution_count": 46,
509515
"metadata": {},
510516
"output_type": "execute_result"
511517
}
512518
],
513519
"source": [
514520
"# Now get the home and away teams into the detailed penalties dataframe\n",
515521
"df_detailed_pens = pd.merge(left=df_detailed_pens, right=df_teams, how='inner', left_on='home_team_id', right_on='team_id')\n",
516-
"df_detailed_pens = df_detailed_pens.rename(columns={'teamName': 'homeTeam'})\n",
517-
"df_detailed_pens = df_detailed_pens.drop(['home_team_id','team_id'], axis=1)\n",
522+
"df_detailed_pens.rename(columns={'teamName': 'homeTeam'}, inplace=True)\n",
523+
"df_detailed_pens.drop(columns=['home_team_id','team_id'], axis=1, inplace=True)\n",
518524
"\n",
519525
"df_detailed_pens = pd.merge(left=df_detailed_pens, right=df_teams, how='inner', left_on='away_team_id', right_on='team_id')\n",
520-
"df_detailed_pens = df_detailed_pens.rename(columns={'teamName': 'awayTeam'})\n",
521-
"df_detailed_pens = df_detailed_pens.drop(['away_team_id','team_id'], axis=1)\n",
526+
"df_detailed_pens.rename(columns={'teamName': 'awayTeam'}, inplace=True)\n",
527+
"df_detailed_pens.drop(columns=['away_team_id','team_id'], axis=1, inplace=True)\n",
522528
"\n",
523529
"df_detailed_pens.head()"
524530
]
@@ -533,7 +539,7 @@
533539
},
534540
{
535541
"cell_type": "code",
536-
"execution_count": 6,
542+
"execution_count": 47,
537543
"metadata": {},
538544
"outputs": [
539545
{
@@ -550,7 +556,7 @@
550556
"Name: penaltyMinutes, dtype: float64"
551557
]
552558
},
553-
"execution_count": 6,
559+
"execution_count": 47,
554560
"metadata": {},
555561
"output_type": "execute_result"
556562
}
@@ -562,7 +568,7 @@
562568
},
563569
{
564570
"cell_type": "code",
565-
"execution_count": 8,
571+
"execution_count": 48,
566572
"metadata": {},
567573
"outputs": [],
568574
"source": [
@@ -578,7 +584,7 @@
578584
},
579585
{
580586
"cell_type": "code",
581-
"execution_count": 19,
587+
"execution_count": 49,
582588
"metadata": {},
583589
"outputs": [
584590
{
@@ -65572,7 +65578,7 @@
6557265578
},
6557365579
{
6557465580
"cell_type": "code",
65575-
"execution_count": 13,
65581+
"execution_count": 51,
6557665582
"metadata": {},
6557765583
"outputs": [
6557865584
{
@@ -139811,7 +139817,7 @@
139811139817
},
139812139818
{
139813139819
"cell_type": "code",
139814-
"execution_count": 27,
139820+
"execution_count": 52,
139815139821
"metadata": {},
139816139822
"outputs": [
139817139823
{
@@ -139890,7 +139896,7 @@
139890139896
"4 10.0 R Maple Leafs Canadiens"
139891139897
]
139892139898
},
139893-
"execution_count": 27,
139899+
"execution_count": 52,
139894139900
"metadata": {},
139895139901
"output_type": "execute_result"
139896139902
}
@@ -139903,7 +139909,7 @@
139903139909
},
139904139910
{
139905139911
"cell_type": "code",
139906-
"execution_count": 28,
139912+
"execution_count": 53,
139907139913
"metadata": {},
139908139914
"outputs": [
139909139915
{
@@ -139920,7 +139926,6 @@
139920139926
"# Connect to the workspace by reading from the config.json file (downloaded from the Machine Learning page in the portal)\n",
139921139927
"from azureml.core import Workspace, Dataset\n",
139922139928
"\n",
139923-
"\n",
139924139929
"ws = Workspace.from_config() # config.json is needed in this folder for this to work\n",
139925139930
"\n",
139926139931
"# Display high-level info on the workspace\n",
@@ -139931,7 +139936,7 @@
139931139936
},
139932139937
{
139933139938
"cell_type": "code",
139934-
"execution_count": 39,
139939+
"execution_count": 54,
139935139940
"metadata": {},
139936139941
"outputs": [
139937139942
{
@@ -139941,11 +139946,11 @@
139941139946
"Validating arguments.\n",
139942139947
"Arguments validated.\n",
139943139948
"Successfully obtained datastore reference and path.\n",
139944-
"Uploading file to managed-dataset/783d4c3f-e99e-47b9-ac1d-df55a2564c72/\n",
139949+
"Uploading file to managed-dataset/edbbcf3b-8a61-49d3-b9a0-040a45e85c16/\n",
139945139950
"Successfully uploaded file to datastore.\n",
139946139951
"Creating and registering a new dataset.\n",
139947139952
"Successfully created and registered a new dataset.\n",
139948-
"NHL-Penalties-2020 v6 (ID: 69058fd8-167e-4e66-9aa8-733590aa2e4c)\n"
139953+
"NHL-Penalties-2020 v7 (ID: 8b25d268-7a7a-40d4-b496-4c016aeaaca0)\n"
139949139954
]
139950139955
}
139951139956
],

0 commit comments

Comments
 (0)