31
31
},
32
32
{
33
33
"cell_type": "code",
34
- "execution_count": 1 ,
34
+ "execution_count": 20 ,
35
35
"metadata": {},
36
36
"outputs": [
37
37
{
97
97
"2000020005 52.0"
98
98
]
99
99
},
100
- "execution_count": 1 ,
100
+ "execution_count": 20 ,
101
101
"metadata": {},
102
102
"output_type": "execute_result"
103
103
}
120
120
},
121
121
{
122
122
"cell_type": "code",
123
- "execution_count": 2 ,
123
+ "execution_count": 33 ,
124
124
"metadata": {},
125
125
"outputs": [
126
126
{
205
205
"4 2017020586 20172018 R 20 24"
206
206
]
207
207
},
208
- "execution_count": 2 ,
208
+ "execution_count": 33 ,
209
209
"metadata": {},
210
210
"output_type": "execute_result"
211
211
}
220
220
"# Historical information is helpful, but does become outdated. Let's look at 2015 onwards, then drop our identifiers that might throw off machine learning\n",
221
221
"df_games = df_games[df_games['season'] >= 20152016]\n",
222
222
"\n",
223
+ "# A few unusual values occur. Make sure we're dealing with playoff or regular season games only\n",
224
+ "df_games = df_games[(df_games['type'] == 'R') | (df_games['type'] == 'P')] \n",
225
+ "\n",
223
226
"# Remap the season to a string type\n",
224
227
"df_games['season'] = df_games['season'].map(str)\n",
225
228
"\n",
229
232
},
230
233
{
231
234
"cell_type": "code",
232
- "execution_count": 3 ,
235
+ "execution_count": 45 ,
233
236
"metadata": {},
234
237
"outputs": [
235
238
{
263
266
" </thead>\n",
264
267
" <tbody>\n",
265
268
" <tr>\n",
266
- " <th>0 </th>\n",
267
- " <td>2000020001 </td>\n",
268
- " <td>42 .0</td>\n",
269
- " <td>NaN </td>\n",
270
- " <td>NaN </td>\n",
271
- " <td>NaN </td>\n",
272
- " <td>NaN </td>\n",
269
+ " <th>26292 </th>\n",
270
+ " <td>2019030414 </td>\n",
271
+ " <td>36 .0</td>\n",
272
+ " <td>20192020 </td>\n",
273
+ " <td>P </td>\n",
274
+ " <td>14.0 </td>\n",
275
+ " <td>25.0 </td>\n",
273
276
" </tr>\n",
274
277
" <tr>\n",
275
- " <th>1 </th>\n",
276
- " <td>2000020002 </td>\n",
277
- " <td>32 .0</td>\n",
278
- " <td>NaN </td>\n",
279
- " <td>NaN </td>\n",
280
- " <td>NaN </td>\n",
281
- " <td>NaN </td>\n",
278
+ " <th>26293 </th>\n",
279
+ " <td>2019030415 </td>\n",
280
+ " <td>12 .0</td>\n",
281
+ " <td>20192020 </td>\n",
282
+ " <td>P </td>\n",
283
+ " <td>25.0 </td>\n",
284
+ " <td>14.0 </td>\n",
282
285
" </tr>\n",
283
286
" <tr>\n",
284
- " <th>2 </th>\n",
285
- " <td>2000020003 </td>\n",
286
- " <td>45 .0</td>\n",
287
- " <td>NaN </td>\n",
288
- " <td>NaN </td>\n",
289
- " <td>NaN </td>\n",
290
- " <td>NaN </td>\n",
287
+ " <th>26294 </th>\n",
288
+ " <td>2019030415 </td>\n",
289
+ " <td>12 .0</td>\n",
290
+ " <td>20192020 </td>\n",
291
+ " <td>P </td>\n",
292
+ " <td>25.0 </td>\n",
293
+ " <td>14.0 </td>\n",
291
294
" </tr>\n",
292
295
" <tr>\n",
293
- " <th>3 </th>\n",
294
- " <td>2000020004 </td>\n",
296
+ " <th>26295 </th>\n",
297
+ " <td>2019030416 </td>\n",
295
298
" <td>24.0</td>\n",
296
- " <td>NaN </td>\n",
297
- " <td>NaN </td>\n",
298
- " <td>NaN </td>\n",
299
- " <td>NaN </td>\n",
299
+ " <td>20192020 </td>\n",
300
+ " <td>P </td>\n",
301
+ " <td>14.0 </td>\n",
302
+ " <td>25.0 </td>\n",
300
303
" </tr>\n",
301
304
" <tr>\n",
302
- " <th>4 </th>\n",
303
- " <td>2000020005 </td>\n",
304
- " <td>52 .0</td>\n",
305
- " <td>NaN </td>\n",
306
- " <td>NaN </td>\n",
307
- " <td>NaN </td>\n",
308
- " <td>NaN </td>\n",
305
+ " <th>26296 </th>\n",
306
+ " <td>2019030416 </td>\n",
307
+ " <td>24 .0</td>\n",
308
+ " <td>20192020 </td>\n",
309
+ " <td>P </td>\n",
310
+ " <td>14.0 </td>\n",
311
+ " <td>25.0 </td>\n",
309
312
" </tr>\n",
310
313
" </tbody>\n",
311
314
"</table>\n",
312
315
"</div>"
313
316
],
314
317
"text/plain": [
315
- " game_id penaltyMinutes season type away_team_id home_team_id\n",
316
- "0 2000020001 42 .0 NaN NaN NaN NaN \n",
317
- "1 2000020002 32 .0 NaN NaN NaN NaN \n",
318
- "2 2000020003 45 .0 NaN NaN NaN NaN \n",
319
- "3 2000020004 24.0 NaN NaN NaN NaN \n",
320
- "4 2000020005 52 .0 NaN NaN NaN NaN "
318
+ " game_id penaltyMinutes season type away_team_id home_team_id\n",
319
+ "26292 2019030414 36 .0 20192020 P 14.0 25.0 \n",
320
+ "26293 2019030415 12 .0 20192020 P 25.0 14.0 \n",
321
+ "26294 2019030415 12 .0 20192020 P 25.0 14.0 \n",
322
+ "26295 2019030416 24.0 20192020 P 14.0 25.0 \n",
323
+ "26296 2019030416 24 .0 20192020 P 14.0 25.0 "
321
324
]
322
325
},
323
- "execution_count": 3 ,
326
+ "execution_count": 45 ,
324
327
"metadata": {},
325
328
"output_type": "execute_result"
326
329
}
329
332
"# Aggregate everything together\n",
330
333
"df_detailed_pens = pd.merge(left=df_team_stats, right=df_games, how='left', left_on='game_id', right_on='game_id')\n",
331
334
"\n",
332
- "df_detailed_pens.head()"
335
+ "# Drop NaN occurrences\n",
336
+ "df_detailed_pens = df_detailed_pens.dropna()\n",
337
+ "\n",
338
+ "df_detailed_pens.tail()"
333
339
]
334
340
},
335
341
{
336
342
"cell_type": "code",
337
- "execution_count": 4 ,
343
+ "execution_count": 41 ,
338
344
"metadata": {},
339
345
"outputs": [
340
346
{
401
407
"4 6 Bruins"
402
408
]
403
409
},
404
- "execution_count": 4 ,
410
+ "execution_count": 41 ,
405
411
"metadata": {},
406
412
"output_type": "execute_result"
407
413
}
414
420
},
415
421
{
416
422
"cell_type": "code",
417
- "execution_count": 5 ,
423
+ "execution_count": 46 ,
418
424
"metadata": {},
419
425
"outputs": [
420
426
{
505
511
"4 2017021101 10.0 20172018 R Maple Leafs Canadiens"
506
512
]
507
513
},
508
- "execution_count": 5 ,
514
+ "execution_count": 46 ,
509
515
"metadata": {},
510
516
"output_type": "execute_result"
511
517
}
512
518
],
513
519
"source": [
514
520
"# Now get the home and away teams into the detailed penalties dataframe\n",
515
521
"df_detailed_pens = pd.merge(left=df_detailed_pens, right=df_teams, how='inner', left_on='home_team_id', right_on='team_id')\n",
516
- "df_detailed_pens = df_detailed_pens .rename(columns={'teamName': 'homeTeam'})\n",
517
- "df_detailed_pens = df_detailed_pens .drop(['home_team_id','team_id'], axis=1)\n",
522
+ "df_detailed_pens.rename(columns={'teamName': 'homeTeam'}, inplace=True )\n",
523
+ "df_detailed_pens.drop(columns= ['home_team_id','team_id'], axis=1, inplace=True )\n",
518
524
"\n",
519
525
"df_detailed_pens = pd.merge(left=df_detailed_pens, right=df_teams, how='inner', left_on='away_team_id', right_on='team_id')\n",
520
- "df_detailed_pens = df_detailed_pens .rename(columns={'teamName': 'awayTeam'})\n",
521
- "df_detailed_pens = df_detailed_pens .drop(['away_team_id','team_id'], axis=1)\n",
526
+ "df_detailed_pens.rename(columns={'teamName': 'awayTeam'}, inplace=True )\n",
527
+ "df_detailed_pens.drop(columns= ['away_team_id','team_id'], axis=1, inplace=True )\n",
522
528
"\n",
523
529
"df_detailed_pens.head()"
524
530
]
533
539
},
534
540
{
535
541
"cell_type": "code",
536
- "execution_count": 6 ,
542
+ "execution_count": 47 ,
537
543
"metadata": {},
538
544
"outputs": [
539
545
{
550
556
"Name: penaltyMinutes, dtype: float64"
551
557
]
552
558
},
553
- "execution_count": 6 ,
559
+ "execution_count": 47 ,
554
560
"metadata": {},
555
561
"output_type": "execute_result"
556
562
}
562
568
},
563
569
{
564
570
"cell_type": "code",
565
- "execution_count": 8 ,
571
+ "execution_count": 48 ,
566
572
"metadata": {},
567
573
"outputs": [],
568
574
"source": [
578
584
},
579
585
{
580
586
"cell_type": "code",
581
- "execution_count": 19 ,
587
+ "execution_count": 49 ,
582
588
"metadata": {},
583
589
"outputs": [
584
590
{
65572
65578
},
65573
65579
{
65574
65580
"cell_type": "code",
65575
- "execution_count": 13 ,
65581
+ "execution_count": 51 ,
65576
65582
"metadata": {},
65577
65583
"outputs": [
65578
65584
{
@@ -139811,7 +139817,7 @@
139811
139817
},
139812
139818
{
139813
139819
"cell_type": "code",
139814
- "execution_count": 27 ,
139820
+ "execution_count": 52 ,
139815
139821
"metadata": {},
139816
139822
"outputs": [
139817
139823
{
@@ -139890,7 +139896,7 @@
139890
139896
"4 10.0 R Maple Leafs Canadiens"
139891
139897
]
139892
139898
},
139893
- "execution_count": 27 ,
139899
+ "execution_count": 52 ,
139894
139900
"metadata": {},
139895
139901
"output_type": "execute_result"
139896
139902
}
@@ -139903,7 +139909,7 @@
139903
139909
},
139904
139910
{
139905
139911
"cell_type": "code",
139906
- "execution_count": 28 ,
139912
+ "execution_count": 53 ,
139907
139913
"metadata": {},
139908
139914
"outputs": [
139909
139915
{
@@ -139920,7 +139926,6 @@
139920
139926
"# Connect to the workspace by reading from the config.json file (downloaded from the Machine Learning page in the portal)\n",
139921
139927
"from azureml.core import Workspace, Dataset\n",
139922
139928
"\n",
139923
- "\n",
139924
139929
"ws = Workspace.from_config() # config.json is needed in this folder for this to work\n",
139925
139930
"\n",
139926
139931
"# Display high-level info on the workspace\n",
@@ -139931,7 +139936,7 @@
139931
139936
},
139932
139937
{
139933
139938
"cell_type": "code",
139934
- "execution_count": 39 ,
139939
+ "execution_count": 54 ,
139935
139940
"metadata": {},
139936
139941
"outputs": [
139937
139942
{
@@ -139941,11 +139946,11 @@
139941
139946
"Validating arguments.\n",
139942
139947
"Arguments validated.\n",
139943
139948
"Successfully obtained datastore reference and path.\n",
139944
- "Uploading file to managed-dataset/783d4c3f-e99e-47b9-ac1d-df55a2564c72 /\n",
139949
+ "Uploading file to managed-dataset/edbbcf3b-8a61-49d3-b9a0-040a45e85c16 /\n",
139945
139950
"Successfully uploaded file to datastore.\n",
139946
139951
"Creating and registering a new dataset.\n",
139947
139952
"Successfully created and registered a new dataset.\n",
139948
- "NHL-Penalties-2020 v6 (ID: 69058fd8-167e-4e66-9aa8-733590aa2e4c )\n"
139953
+ "NHL-Penalties-2020 v7 (ID: 8b25d268-7a7a-40d4-b496-4c016aeaaca0 )\n"
139949
139954
]
139950
139955
}
139951
139956
],
0 commit comments