Skip to content

Commit 677e4e9

Browse files
committed
Added util function
1 parent fd76d19 commit 677e4e9

File tree

2 files changed

+162
-4
lines changed

2 files changed

+162
-4
lines changed

sklearn-example-2.ipynb

+108-4
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,120 @@
1616
},
1717
{
1818
"cell_type": "code",
19-
"execution_count": 43,
19+
"execution_count": 2,
2020
"metadata": {
2121
"collapsed": false,
2222
"deletable": true,
2323
"editable": true
2424
},
25-
"outputs": [],
25+
"outputs": [
26+
{
27+
"data": {
28+
"text/html": [
29+
"<div>\n",
30+
"<style scoped>\n",
31+
" .dataframe tbody tr th:only-of-type {\n",
32+
" vertical-align: middle;\n",
33+
" }\n",
34+
"\n",
35+
" .dataframe tbody tr th {\n",
36+
" vertical-align: top;\n",
37+
" }\n",
38+
"\n",
39+
" .dataframe thead th {\n",
40+
" text-align: right;\n",
41+
" }\n",
42+
"</style>\n",
43+
"<table border=\"1\" class=\"dataframe\">\n",
44+
" <thead>\n",
45+
" <tr style=\"text-align: right;\">\n",
46+
" <th></th>\n",
47+
" <th>Column</th>\n",
48+
" <th>Data Type</th>\n",
49+
" <th>Non-Null Count</th>\n",
50+
" <th>Null Count</th>\n",
51+
" <th>Unique Values</th>\n",
52+
" <th>Numeric Stats</th>\n",
53+
" </tr>\n",
54+
" </thead>\n",
55+
" <tbody>\n",
56+
" <tr>\n",
57+
" <th>0</th>\n",
58+
" <td>sepal_length</td>\n",
59+
" <td>float64</td>\n",
60+
" <td>150</td>\n",
61+
" <td>0</td>\n",
62+
" <td>35</td>\n",
63+
" <td>min: 4.30, max: 7.90, mean: 5.84, median: 5.80</td>\n",
64+
" </tr>\n",
65+
" <tr>\n",
66+
" <th>1</th>\n",
67+
" <td>sepal_width</td>\n",
68+
" <td>float64</td>\n",
69+
" <td>150</td>\n",
70+
" <td>0</td>\n",
71+
" <td>23</td>\n",
72+
" <td>min: 2.00, max: 4.40, mean: 3.06, median: 3.00</td>\n",
73+
" </tr>\n",
74+
" <tr>\n",
75+
" <th>2</th>\n",
76+
" <td>petal_length</td>\n",
77+
" <td>float64</td>\n",
78+
" <td>150</td>\n",
79+
" <td>0</td>\n",
80+
" <td>43</td>\n",
81+
" <td>min: 1.00, max: 6.90, mean: 3.76, median: 4.35</td>\n",
82+
" </tr>\n",
83+
" <tr>\n",
84+
" <th>3</th>\n",
85+
" <td>petal_width</td>\n",
86+
" <td>float64</td>\n",
87+
" <td>150</td>\n",
88+
" <td>0</td>\n",
89+
" <td>22</td>\n",
90+
" <td>min: 0.10, max: 2.50, mean: 1.20, median: 1.30</td>\n",
91+
" </tr>\n",
92+
" <tr>\n",
93+
" <th>4</th>\n",
94+
" <td>species</td>\n",
95+
" <td>object</td>\n",
96+
" <td>150</td>\n",
97+
" <td>0</td>\n",
98+
" <td>3</td>\n",
99+
" <td>N/A</td>\n",
100+
" </tr>\n",
101+
" </tbody>\n",
102+
"</table>\n",
103+
"</div>"
104+
],
105+
"text/plain": [
106+
" Column Data Type Non-Null Count Null Count Unique Values \\\n",
107+
"0 sepal_length float64 150 0 35 \n",
108+
"1 sepal_width float64 150 0 23 \n",
109+
"2 petal_length float64 150 0 43 \n",
110+
"3 petal_width float64 150 0 22 \n",
111+
"4 species object 150 0 3 \n",
112+
"\n",
113+
" Numeric Stats \n",
114+
"0 min: 4.30, max: 7.90, mean: 5.84, median: 5.80 \n",
115+
"1 min: 2.00, max: 4.40, mean: 3.06, median: 3.00 \n",
116+
"2 min: 1.00, max: 6.90, mean: 3.76, median: 4.35 \n",
117+
"3 min: 0.10, max: 2.50, mean: 1.20, median: 1.30 \n",
118+
"4 N/A "
119+
]
120+
},
121+
"execution_count": 2,
122+
"metadata": {},
123+
"output_type": "execute_result"
124+
}
125+
],
26126
"source": [
27127
"import seaborn as sns\n",
28-
"iris = sns.load_dataset('iris')"
128+
"from utils import summarize_dataframe\n",
129+
"\n",
130+
"iris = sns.load_dataset(\"iris\")\n",
131+
"\n",
132+
"summarize_dataframe(iris)"
29133
]
30134
},
31135
{
@@ -189,7 +293,7 @@
189293
"name": "python",
190294
"nbconvert_exporter": "python",
191295
"pygments_lexer": "ipython3",
192-
"version": "3.5.1"
296+
"version": "3.10.0"
193297
}
194298
},
195299
"nbformat": 4,

utils.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import pandas as pd
2+
import numpy as np
3+
4+
5+
def summarize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate a per-column summary of a pandas DataFrame.

    For every column of ``df`` the summary reports:
    - the dtype, rendered as a string
    - the non-null and null value counts
    - the number of distinct non-null values
    - min / max / mean / median for numeric columns ("N/A" otherwise)

    Parameters:
        df (pd.DataFrame): The input DataFrame to summarize

    Returns:
        pd.DataFrame: One row per column of ``df``, with the columns
        "Column", "Data Type", "Non-Null Count", "Null Count",
        "Unique Values" and "Numeric Stats".
    """
    # Parallel lists, one entry per input column; kept as separate lists so
    # that an input with no columns still yields all six summary columns.
    columns = []
    dtypes = []
    non_null_counts = []
    null_counts = []
    unique_counts = []
    numeric_stats = []

    for col in df.columns:
        # Look the column up once instead of on every statistic.
        series = df[col]
        dtype = series.dtype

        columns.append(col)
        dtypes.append(str(dtype))
        non_null_counts.append(series.count())
        null_counts.append(series.isnull().sum())
        unique_counts.append(series.nunique())

        # Use pandas' dtype predicates rather than np.issubdtype:
        # np.issubdtype raises TypeError on pandas extension dtypes such as
        # "category", and it classifies timedelta64 as a number, which then
        # fails in the ":.2f" formatting below. Booleans are excluded so they
        # keep being reported as non-numeric.
        is_numeric = pd.api.types.is_numeric_dtype(
            dtype
        ) and not pd.api.types.is_bool_dtype(dtype)

        if is_numeric:
            numeric_stats.append(
                f"min: {series.min():.2f}, max: {series.max():.2f}, "
                f"mean: {series.mean():.2f}, median: {series.median():.2f}"
            )
        else:
            numeric_stats.append("N/A")

    # Create summary DataFrame
    summary_df = pd.DataFrame(
        {
            "Column": columns,
            "Data Type": dtypes,
            "Non-Null Count": non_null_counts,
            "Null Count": null_counts,
            "Unique Values": unique_counts,
            "Numeric Stats": numeric_stats,
        }
    )

    return summary_df

0 commit comments

Comments
 (0)