3131 },
3232 {
3333 "cell_type": "code",
34- "execution_count": 1 ,
34+ "execution_count": 20 ,
3535 "metadata": {},
3636 "outputs": [
3737 {
9797 "2000020005 52.0"
9898 ]
9999 },
100- "execution_count": 1 ,
100+ "execution_count": 20 ,
101101 "metadata": {},
102102 "output_type": "execute_result"
103103 }
120120 },
121121 {
122122 "cell_type": "code",
123- "execution_count": 2 ,
123+ "execution_count": 33 ,
124124 "metadata": {},
125125 "outputs": [
126126 {
205205 "4 2017020586 20172018 R 20 24"
206206 ]
207207 },
208- "execution_count": 2 ,
208+ "execution_count": 33 ,
209209 "metadata": {},
210210 "output_type": "execute_result"
211211 }
220220 "# Historical information is helpful, but does become outdated. Let's look at 2015 onwards, then drop our identifiers that might throw off machine learning\n",
221221 "df_games = df_games[df_games['season'] >= 20152016]\n",
222222 "\n",
223+ "# A few unusual values occur. Make sure we're dealing with playoff or regular season games only\n",
224+ "df_games = df_games[(df_games['type'] == 'R') | (df_games['type'] == 'P')] \n",
225+ "\n",
223226 "# Remap the season to a string type\n",
224227 "df_games['season'] = df_games['season'].map(str)\n",
225228 "\n",
229232 },
230233 {
231234 "cell_type": "code",
232- "execution_count": 3 ,
235+ "execution_count": 45 ,
233236 "metadata": {},
234237 "outputs": [
235238 {
263266 " </thead>\n",
264267 " <tbody>\n",
265268 " <tr>\n",
266- " <th>0 </th>\n",
267- " <td>2000020001 </td>\n",
268- " <td>42 .0</td>\n",
269- " <td>NaN </td>\n",
270- " <td>NaN </td>\n",
271- " <td>NaN </td>\n",
272- " <td>NaN </td>\n",
269+ " <th>26292 </th>\n",
270+ " <td>2019030414 </td>\n",
271+ " <td>36 .0</td>\n",
272+ " <td>20192020 </td>\n",
273+ " <td>P </td>\n",
274+ " <td>14.0 </td>\n",
275+ " <td>25.0 </td>\n",
273276 " </tr>\n",
274277 " <tr>\n",
275- " <th>1 </th>\n",
276- " <td>2000020002 </td>\n",
277- " <td>32 .0</td>\n",
278- " <td>NaN </td>\n",
279- " <td>NaN </td>\n",
280- " <td>NaN </td>\n",
281- " <td>NaN </td>\n",
278+ " <th>26293 </th>\n",
279+ " <td>2019030415 </td>\n",
280+ " <td>12 .0</td>\n",
281+ " <td>20192020 </td>\n",
282+ " <td>P </td>\n",
283+ " <td>25.0 </td>\n",
284+ " <td>14.0 </td>\n",
282285 " </tr>\n",
283286 " <tr>\n",
284- " <th>2 </th>\n",
285- " <td>2000020003 </td>\n",
286- " <td>45 .0</td>\n",
287- " <td>NaN </td>\n",
288- " <td>NaN </td>\n",
289- " <td>NaN </td>\n",
290- " <td>NaN </td>\n",
287+ " <th>26294 </th>\n",
288+ " <td>2019030415 </td>\n",
289+ " <td>12 .0</td>\n",
290+ " <td>20192020 </td>\n",
291+ " <td>P </td>\n",
292+ " <td>25.0 </td>\n",
293+ " <td>14.0 </td>\n",
291294 " </tr>\n",
292295 " <tr>\n",
293- " <th>3 </th>\n",
294- " <td>2000020004 </td>\n",
296+ " <th>26295 </th>\n",
297+ " <td>2019030416 </td>\n",
295298 " <td>24.0</td>\n",
296- " <td>NaN </td>\n",
297- " <td>NaN </td>\n",
298- " <td>NaN </td>\n",
299- " <td>NaN </td>\n",
299+ " <td>20192020 </td>\n",
300+ " <td>P </td>\n",
301+ " <td>14.0 </td>\n",
302+ " <td>25.0 </td>\n",
300303 " </tr>\n",
301304 " <tr>\n",
302- " <th>4 </th>\n",
303- " <td>2000020005 </td>\n",
304- " <td>52 .0</td>\n",
305- " <td>NaN </td>\n",
306- " <td>NaN </td>\n",
307- " <td>NaN </td>\n",
308- " <td>NaN </td>\n",
305+ " <th>26296 </th>\n",
306+ " <td>2019030416 </td>\n",
307+ " <td>24 .0</td>\n",
308+ " <td>20192020 </td>\n",
309+ " <td>P </td>\n",
310+ " <td>14.0 </td>\n",
311+ " <td>25.0 </td>\n",
309312 " </tr>\n",
310313 " </tbody>\n",
311314 "</table>\n",
312315 "</div>"
313316 ],
314317 "text/plain": [
315- " game_id penaltyMinutes season type away_team_id home_team_id\n",
316- "0 2000020001 42 .0 NaN NaN NaN NaN \n",
317- "1 2000020002 32 .0 NaN NaN NaN NaN \n",
318- "2 2000020003 45 .0 NaN NaN NaN NaN \n",
319- "3 2000020004 24.0 NaN NaN NaN NaN \n",
320- "4 2000020005 52 .0 NaN NaN NaN NaN "
318+ " game_id penaltyMinutes season type away_team_id home_team_id\n",
319+ "26292 2019030414 36 .0 20192020 P 14.0 25.0 \n",
320+ "26293 2019030415 12 .0 20192020 P 25.0 14.0 \n",
321+ "26294 2019030415 12 .0 20192020 P 25.0 14.0 \n",
322+ "26295 2019030416 24.0 20192020 P 14.0 25.0 \n",
323+ "26296 2019030416 24 .0 20192020 P 14.0 25.0 "
321324 ]
322325 },
323- "execution_count": 3 ,
326+ "execution_count": 45 ,
324327 "metadata": {},
325328 "output_type": "execute_result"
326329 }
329332 "# Aggregate everything together\n",
330333 "df_detailed_pens = pd.merge(left=df_team_stats, right=df_games, how='left', left_on='game_id', right_on='game_id')\n",
331334 "\n",
332- "df_detailed_pens.head()"
335+ "# Drop NaN occurrences\n",
336+ "df_detailed_pens = df_detailed_pens.dropna()\n",
337+ "\n",
338+ "df_detailed_pens.tail()"
333339 ]
334340 },
335341 {
336342 "cell_type": "code",
337- "execution_count": 4 ,
343+ "execution_count": 41 ,
338344 "metadata": {},
339345 "outputs": [
340346 {
401407 "4 6 Bruins"
402408 ]
403409 },
404- "execution_count": 4 ,
410+ "execution_count": 41 ,
405411 "metadata": {},
406412 "output_type": "execute_result"
407413 }
414420 },
415421 {
416422 "cell_type": "code",
417- "execution_count": 5 ,
423+ "execution_count": 46 ,
418424 "metadata": {},
419425 "outputs": [
420426 {
505511 "4 2017021101 10.0 20172018 R Maple Leafs Canadiens"
506512 ]
507513 },
508- "execution_count": 5 ,
514+ "execution_count": 46 ,
509515 "metadata": {},
510516 "output_type": "execute_result"
511517 }
512518 ],
513519 "source": [
514520 "# Now get the home and away teams into the detailed penalties dataframe\n",
515521 "df_detailed_pens = pd.merge(left=df_detailed_pens, right=df_teams, how='inner', left_on='home_team_id', right_on='team_id')\n",
516- "df_detailed_pens = df_detailed_pens .rename(columns={'teamName': 'homeTeam'})\n",
517- "df_detailed_pens = df_detailed_pens .drop(['home_team_id','team_id'], axis=1)\n",
522+ "df_detailed_pens.rename(columns={'teamName': 'homeTeam'}, inplace=True )\n",
523+ "df_detailed_pens.drop(columns= ['home_team_id','team_id'], axis=1, inplace=True )\n",
518524 "\n",
519525 "df_detailed_pens = pd.merge(left=df_detailed_pens, right=df_teams, how='inner', left_on='away_team_id', right_on='team_id')\n",
520- "df_detailed_pens = df_detailed_pens .rename(columns={'teamName': 'awayTeam'})\n",
521- "df_detailed_pens = df_detailed_pens .drop(['away_team_id','team_id'], axis=1)\n",
526+ "df_detailed_pens.rename(columns={'teamName': 'awayTeam'}, inplace=True )\n",
527+ "df_detailed_pens.drop(columns= ['away_team_id','team_id'], axis=1, inplace=True )\n",
522528 "\n",
523529 "df_detailed_pens.head()"
524530 ]
533539 },
534540 {
535541 "cell_type": "code",
536- "execution_count": 6 ,
542+ "execution_count": 47 ,
537543 "metadata": {},
538544 "outputs": [
539545 {
550556 "Name: penaltyMinutes, dtype: float64"
551557 ]
552558 },
553- "execution_count": 6 ,
559+ "execution_count": 47 ,
554560 "metadata": {},
555561 "output_type": "execute_result"
556562 }
562568 },
563569 {
564570 "cell_type": "code",
565- "execution_count": 8 ,
571+ "execution_count": 48 ,
566572 "metadata": {},
567573 "outputs": [],
568574 "source": [
578584 },
579585 {
580586 "cell_type": "code",
581- "execution_count": 19 ,
587+ "execution_count": 49 ,
582588 "metadata": {},
583589 "outputs": [
584590 {
6557265578 },
6557365579 {
6557465580 "cell_type": "code",
65575- "execution_count": 13 ,
65581+ "execution_count": 51 ,
6557665582 "metadata": {},
6557765583 "outputs": [
6557865584 {
@@ -139811,7 +139817,7 @@
139811139817 },
139812139818 {
139813139819 "cell_type": "code",
139814- "execution_count": 27 ,
139820+ "execution_count": 52 ,
139815139821 "metadata": {},
139816139822 "outputs": [
139817139823 {
@@ -139890,7 +139896,7 @@
139890139896 "4 10.0 R Maple Leafs Canadiens"
139891139897 ]
139892139898 },
139893- "execution_count": 27 ,
139899+ "execution_count": 52 ,
139894139900 "metadata": {},
139895139901 "output_type": "execute_result"
139896139902 }
@@ -139903,7 +139909,7 @@
139903139909 },
139904139910 {
139905139911 "cell_type": "code",
139906- "execution_count": 28 ,
139912+ "execution_count": 53 ,
139907139913 "metadata": {},
139908139914 "outputs": [
139909139915 {
@@ -139920,7 +139926,6 @@
139920139926 "# Connect to the workspace by reading from the config.json file (downloaded from the Machine Learning page in the portal)\n",
139921139927 "from azureml.core import Workspace, Dataset\n",
139922139928 "\n",
139923- "\n",
139924139929 "ws = Workspace.from_config() # config.json is needed in this folder for this to work\n",
139925139930 "\n",
139926139931 "# Display high-level info on the workspace\n",
@@ -139931,7 +139936,7 @@
139931139936 },
139932139937 {
139933139938 "cell_type": "code",
139934- "execution_count": 39 ,
139939+ "execution_count": 54 ,
139935139940 "metadata": {},
139936139941 "outputs": [
139937139942 {
@@ -139941,11 +139946,11 @@
139941139946 "Validating arguments.\n",
139942139947 "Arguments validated.\n",
139943139948 "Successfully obtained datastore reference and path.\n",
139944- "Uploading file to managed-dataset/783d4c3f-e99e-47b9-ac1d-df55a2564c72 /\n",
139949+ "Uploading file to managed-dataset/edbbcf3b-8a61-49d3-b9a0-040a45e85c16 /\n",
139945139950 "Successfully uploaded file to datastore.\n",
139946139951 "Creating and registering a new dataset.\n",
139947139952 "Successfully created and registered a new dataset.\n",
139948- "NHL-Penalties-2020 v6 (ID: 69058fd8-167e-4e66-9aa8-733590aa2e4c )\n"
139953+ "NHL-Penalties-2020 v7 (ID: 8b25d268-7a7a-40d4-b496-4c016aeaaca0 )\n"
139949139954 ]
139950139955 }
139951139956 ],
0 commit comments