Skip to content

Commit 6d05592

Browse files
committed
Py Tip: Siuba
1 parent 8469285 commit 6d05592

File tree

3 files changed

+151
-1
lines changed

3 files changed

+151
-1
lines changed

.vscode/settings.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"python.pythonPath": "C:\\Users\\mdanc\\Anaconda3\\envs\\ds4b_101p\\python.exe",
2+
"python.pythonPath": "/Users/mdancho/opt/anaconda3/envs/free_python_tips/bin/python",
33
"jupyter.sendSelectionToInteractiveWindow": true,
44
"jupyter.notebookFileRoot": "${workspaceFolder}",
55
}

02_siuba/02_siuba.py

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# PYTHON TIPS ----
2+
# TIP 002 | Siuba: Dplyr for Python ----
3+
#
4+
# 👉 For Weekly Python-Tips, Sign Up Here:
5+
# https://mailchi.mp/business-science/python_tips_newsletter
6+
7+
# LIBRARIES ----
8+
import numpy as np
9+
import pandas as pd
10+
11+
from siuba import _
12+
from siuba.dply.verbs import group_by, mutate, select, summarize, ungroup
13+
14+
# DATASET ----
15+
16+
mpg_df = pd.read_csv("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/mpg.csv")
17+
mpg_df
18+
19+
# 1.0 GROUP BY + SUMMARIZE
20+
# Goal: Mean and Standard Deviation of weight by number of cylinders
21+
22+
weight_by_cyl_df = mpg_df >> \
23+
group_by("cylinders") >> \
24+
summarize(
25+
mean_weight = np.mean(_.weight),
26+
sd_weight = np.std(_.weight)
27+
)
28+
29+
weight_by_cyl_df
30+
31+
# 2.0 GROUP BY + MUTATE
32+
# Goal: De-mean the mpg by subtracting the average mpg of each cylinder group
33+
34+
mpg_demeaned_by_cyl_df = mpg_df >> \
35+
select('name', 'cylinders', 'mpg') >> \
36+
group_by("cylinders") >> \
37+
mutate(
38+
mean_mpg = np.mean(_.mpg)
39+
) >> \
40+
ungroup() >> \
41+
mutate(
42+
mpg_demeaned_by_cyl = _.mpg - _.mean_mpg
43+
)
44+
45+
mpg_demeaned_by_cyl_df
46+
47+
# 3.0 PANDAS
48+
mpg_demeaned_by_cyl_df[['name', 'cylinders', 'mpg_demeaned_by_cyl']] \
49+
.sort_values('mpg_demeaned_by_cyl', ascending = False) \
50+
.style \
51+
.background_gradient()
52+
53+
# LEARNING PANDAS ----
54+
# - Siuba is great for when you are coming from R to Python (like me)
55+
# - Teams use Pandas: 99% of data wrangling code is written with Pandas
56+
# - Better Learn Pandas if you want to be part of the Team
57+
58+
# I TEACH PANDAS (FROM AN R USER'S PERSPECTIVE)!
59+
# Python for Data Science Automation Course (Contains 5 hours of Pandas)
60+
# https://university.business-science.io/p/python-for-data-science-automation-ds4b-101p

environment.yml

+90
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
# In terminal, run: conda env create -f environment.yml
2+
# To update, run: conda env update -f environment.yml
3+
name: free_python_tips
4+
channels:
5+
- anaconda
6+
- conda-forge
7+
- defaults
8+
dependencies:
9+
- python=3.7.1
10+
- pip
11+
- pip:
12+
# Core Data
13+
- numpy==1.20.2
14+
- pandas==1.2.2
15+
16+
# R Data
17+
- plydata==0.4.3
18+
- siuba==0.0.24
19+
- datatable
20+
21+
# Visualization
22+
- matplotlib==3.3.4
23+
- plotnine==0.7.1
24+
- mizani==0.7.2
25+
- plotly==4.14.3
26+
- altair==4.1.0
27+
28+
# EDA
29+
- pandas-profiling
30+
- ppscore==1.2.0
31+
- pyjanitor==0.20.14
32+
33+
# Modeling & Machine Learning
34+
- statsmodels
35+
- nltk==3.5
36+
- h2o==3.32.0.3
37+
- pycaret==2.3.0
38+
- scikit-learn==0.23.2
39+
- xgboost==0.90
40+
- lightgbm==3.1.1
41+
- catboost==0.24.4
42+
- sklearn-pandas==2.0.4
43+
- scikit-misc==0.1.3
44+
45+
# Time Series
46+
- sktime==0.5.3
47+
- pmdarima==1.8.1
48+
- tsfresh==0.17.0
49+
50+
# Scalability & Automation
51+
- dask==2.30.0
52+
- dask-ml==1.8.0
53+
- dask-xgboost==0.1.11
54+
- zict==1.0.0
55+
- joblib==1.0.1
56+
57+
# API
58+
- fastapi==0.63.0
59+
- uvicorn==0.13.4
60+
61+
# Database
62+
- sqlalchemy==1.4.7
63+
64+
# Excel
65+
- xlsxwriter==1.3.7
66+
- openpyxl
67+
68+
# Jupyter
69+
- jupyterlab==3.0.13
70+
- jupyterlab-server==2.4.0
71+
- jupyter-packaging==0.7.12
72+
- jupyter-server==1.6.1 # Solves ImportError: cannot import name 'get_version_info' from 'jupyter_packaging'
73+
- ipywidgets==7.6.3
74+
- ipympl==0.7.0
75+
- jupytext
76+
- papermill==2.3.3
77+
- nbconvert==5.6.1
78+
79+
# Apps
80+
- streamlit==0.80.0
81+
82+
# Terminal Formatting
83+
- rich
84+
85+
# Extending Pandas
86+
- pandas_flavor
87+
88+
# R users
89+
- radian
90+
- jedi==0.17.2

0 commit comments

Comments
 (0)