Add summarize_dataframe utility and use it in the sklearn example notebook

smith-kyle · smith-kyle · commit 677e4e9471d3 · 2024-10-25T15:44:44.000-07:00
diff --git a/sklearn-example-2.ipynb b/sklearn-example-2.ipynb
@@ -16,16 +16,120 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 43,
+      "execution_count": 2,
       "metadata": {
         "collapsed": false,
         "deletable": true,
         "editable": true
       },
-      "outputs": [],
+      "outputs": [
+        {
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>Column</th>\n",
+              "      <th>Data Type</th>\n",
+              "      <th>Non-Null Count</th>\n",
+              "      <th>Null Count</th>\n",
+              "      <th>Unique Values</th>\n",
+              "      <th>Numeric Stats</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>sepal_length</td>\n",
+              "      <td>float64</td>\n",
+              "      <td>150</td>\n",
+              "      <td>0</td>\n",
+              "      <td>35</td>\n",
+              "      <td>min: 4.30, max: 7.90, mean: 5.84, median: 5.80</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>sepal_width</td>\n",
+              "      <td>float64</td>\n",
+              "      <td>150</td>\n",
+              "      <td>0</td>\n",
+              "      <td>23</td>\n",
+              "      <td>min: 2.00, max: 4.40, mean: 3.06, median: 3.00</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>petal_length</td>\n",
+              "      <td>float64</td>\n",
+              "      <td>150</td>\n",
+              "      <td>0</td>\n",
+              "      <td>43</td>\n",
+              "      <td>min: 1.00, max: 6.90, mean: 3.76, median: 4.35</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>3</th>\n",
+              "      <td>petal_width</td>\n",
+              "      <td>float64</td>\n",
+              "      <td>150</td>\n",
+              "      <td>0</td>\n",
+              "      <td>22</td>\n",
+              "      <td>min: 0.10, max: 2.50, mean: 1.20, median: 1.30</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>4</th>\n",
+              "      <td>species</td>\n",
+              "      <td>object</td>\n",
+              "      <td>150</td>\n",
+              "      <td>0</td>\n",
+              "      <td>3</td>\n",
+              "      <td>N/A</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "         Column Data Type  Non-Null Count  Null Count  Unique Values  \\\n",
+              "0  sepal_length   float64             150           0             35   \n",
+              "1   sepal_width   float64             150           0             23   \n",
+              "2  petal_length   float64             150           0             43   \n",
+              "3   petal_width   float64             150           0             22   \n",
+              "4       species    object             150           0              3   \n",
+              "\n",
+              "                                    Numeric Stats  \n",
+              "0  min: 4.30, max: 7.90, mean: 5.84, median: 5.80  \n",
+              "1  min: 2.00, max: 4.40, mean: 3.06, median: 3.00  \n",
+              "2  min: 1.00, max: 6.90, mean: 3.76, median: 4.35  \n",
+              "3  min: 0.10, max: 2.50, mean: 1.20, median: 1.30  \n",
+              "4                                             N/A  "
+            ]
+          },
+          "execution_count": 2,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
       "source": [
         "import seaborn as sns\n",
-        "iris = sns.load_dataset('iris')"
+        "from utils import summarize_dataframe\n",
+        "\n",
+        "iris = sns.load_dataset(\"iris\")\n",
+        "\n",
+        "summarize_dataframe(iris)"
       ]
     },
     {
@@ -189,7 +293,7 @@
       "name": "python",
       "nbconvert_exporter": "python",
       "pygments_lexer": "ipython3",
-      "version": "3.5.1"
+      "version": "3.10.0"
     }
   },
   "nbformat": 4,
diff --git a/utils.py b/utils.py
@@ -0,0 +1,54 @@
+import pandas as pd
+import numpy as np
+
+
+def summarize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Build a per-column summary of a pandas DataFrame.
+
+    Each row of the result describes one column of ``df``: its dtype, its
+    non-null and null counts, its number of unique values and, for numeric
+    columns only, a "min/max/mean/median" string ("N/A" for every other
+    column).
+
+    Parameters:
+    df (pd.DataFrame): The input DataFrame to summarize
+
+    Returns:
+    pd.DataFrame: One row per column of ``df`` with the columns "Column",
+    "Data Type", "Non-Null Count", "Null Count", "Unique Values" and
+    "Numeric Stats"
+    """
+    is_numeric = pd.api.types.is_numeric_dtype
+    is_bool = pd.api.types.is_bool_dtype
+
+    rows = []
+    for col in df.columns:
+        series = df[col]
+        # pandas' dtype predicates are used instead of np.issubdtype, which
+        # raises TypeError on extension dtypes (category, tz-aware datetime,
+        # nullable Int64, ...) and treats timedelta64 as a number, breaking
+        # the ":.2f" formatting below. Booleans are excluded so that they
+        # keep reporting "N/A".
+        if is_numeric(series.dtype) and not is_bool(series.dtype):
+            stats = (
+                f"min: {series.min():.2f}, max: {series.max():.2f}, "
+                f"mean: {series.mean():.2f}, median: {series.median():.2f}"
+            )
+        else:
+            stats = "N/A"
+        rows.append(
+            {
+                "Column": col,
+                "Data Type": str(series.dtype),
+                "Non-Null Count": series.count(),
+                "Null Count": series.isnull().sum(),
+                "Unique Values": series.nunique(),
+                "Numeric Stats": stats,
+            }
+        )
+
+    # Explicit column names keep the layout stable when ``df`` has no columns.
+    summary_columns = ["Column", "Data Type", "Non-Null Count", "Null Count"]
+    summary_columns += ["Unique Values", "Numeric Stats"]
+    return pd.DataFrame(rows, columns=summary_columns)