posit-dev · rich-iannone · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025 · Feb 11, 2025
diff --git a/data_raw/nycflights.csv b/data_raw/nycflights.csv
diff --git a/data_raw/nycflights.ddb b/data_raw/nycflights.ddb
diff --git a/data_raw/x-02-duckdb.qmd b/data_raw/x-02-duckdb.qmd
@@ -21,6 +21,7 @@ tbl_dates_times_text = pl.DataFrame(
 )
 small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
 game_revenue = pb.load_dataset(dataset="game_revenue", tbl_type="polars")
+nycflights = pb.load_dataset(dataset="nycflights", tbl_type="polars")
 ```
 
 
@@ -59,3 +60,10 @@ with duckdb.connect(database="game_revenue.ddb", read_only=False) as con:
         CREATE TABLE IF NOT EXISTS 'game_revenue' AS SELECT * FROM game_revenue;
     """)
 ```
+
+```{python}
+with duckdb.connect(database="nycflights.ddb", read_only=False) as con:  # writable: creates a table
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS 'nycflights' AS SELECT * FROM nycflights;
+    """)
+```
diff --git a/pointblank/data/nycflights-duckdb.zip b/pointblank/data/nycflights-duckdb.zip
diff --git a/pointblank/data/nycflights.zip b/pointblank/data/nycflights.zip
diff --git a/pointblank/validate.py b/pointblank/validate.py
@@ -161,6 +161,8 @@ def load_dataset(
     - `game_revenue`: A dataset with 2000 rows and 11 columns. Provides revenue data for a game
     development company. For the particular game, there are records of player sessions, the items
     they purchased, ads viewed, and the revenue generated.
+    - `nycflights`: A dataset with 336,776 rows and 18 columns. This dataset provides information
+    about flights departing from New York City airports (JFK, LGA, or EWR) in 2013.
 
     Supported DataFrame Types
     -------------------------
@@ -199,10 +201,25 @@ def load_dataset(
 
     The `game_revenue` dataset is a more real-world dataset with a mix of data types, and it's
     significantly larger than the `small_table` dataset at 2000 rows and 11 columns.
+
+    The `nycflights` dataset can be loaded as a DuckDB table by specifying the dataset name and
+    setting `tbl_type="duckdb"`:
+
+    ```{python}
+    import pointblank as pb
+
+    nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
+
+    pb.preview(nycflights)
+    ```
+
+    The `nycflights` dataset is a large, real-world dataset with 336,776 rows and 18 columns. It
+    provides information about flights that departed from New York City airports (JFK, LGA, and
+    EWR) in 2013.
     """
 
     # Raise an error if the dataset is from the list of provided datasets
-    if dataset not in ["small_table", "game_revenue"]:
+    if dataset not in ["small_table", "game_revenue", "nycflights"]:
         raise ValueError(
             f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
             "- `small_table`\n"
@@ -245,6 +262,7 @@ def load_dataset(
         parse_date_columns = {
             "small_table": ["date_time", "date"],
             "game_revenue": ["session_start", "time", "start_day"],
+            "nycflights": [],
         }
 
         dataset = pd.read_csv(data_path, parse_dates=parse_date_columns[dataset])
@@ -831,6 +849,26 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
     rows, and so on. Any sectors that are light blue indicate that there are no missing values in
     that sector. If there are missing values, the proportion of missing values is shown by a gray
     color (light gray for low proportions, dark gray to black for very high proportions).
+
+    Examples
+    --------
+    The `missing_vals_tbl()` function is useful for quickly identifying columns with missing values
+    in a table. Here's an example using the `nycflights` dataset (loaded as a Polars DataFrame
+    using the `load_dataset()` function):
+
+    ```{python}
+    import pointblank as pb
+
+    nycflights = pb.load_dataset("nycflights", tbl_type="polars")
+
+    pb.missing_vals_tbl(nycflights)
+    ```
+
+    The table shows the proportion of missing values in each column of the `nycflights` dataset. The
+    table is divided into sectors, with each sector representing a range of rows in the table (with
+    around 34,000 rows per sector). The proportion of missing values in each sector is calculated
+    for each column. The various shades of gray indicate the proportion of missing values in each
+    sector. Many columns have no missing values at all, and those sectors are colored light blue.
     """
 
     # Make a copy of the data to avoid modifying the original

diff --git a/tests/test_validate.py b/tests/test_validate.py
@@ -4481,23 +4481,26 @@ def test_load_dataset():
 
     # Load the default dataset (`small_table`) and verify it's a Polars DataFrame
     tbl = load_dataset()
-
     assert isinstance(tbl, pl.DataFrame)
 
     # Load the default dataset (`small_table`) and verify it's a Pandas DataFrame
     tbl = load_dataset(tbl_type="pandas")
-
     assert isinstance(tbl, pd.DataFrame)
 
     # Load the `game_revenue` dataset and verify it's a Polars DataFrame
     tbl = load_dataset(dataset="game_revenue")
-
     assert isinstance(tbl, pl.DataFrame)
 
     # Load the `game_revenue` dataset and verify it's a Pandas DataFrame
-
     tbl = load_dataset(dataset="game_revenue", tbl_type="pandas")
+    assert isinstance(tbl, pd.DataFrame)
 
+    # Load the `nycflights` dataset and verify it's a Polars DataFrame
+    tbl = load_dataset(dataset="nycflights")
+    assert isinstance(tbl, pl.DataFrame)
+
+    # Load the `nycflights` dataset and verify it's a Pandas DataFrame
+    tbl = load_dataset(dataset="nycflights", tbl_type="pandas")
     assert isinstance(tbl, pd.DataFrame)
 
 
@@ -4730,6 +4733,9 @@ def test_missing_vals_tbl_no_fail_pd_table():
     game_revenue = load_dataset(dataset="game_revenue", tbl_type="pandas")
     missing_vals_tbl(game_revenue)
 
+    nycflights = load_dataset(dataset="nycflights", tbl_type="pandas")
+    missing_vals_tbl(nycflights)
+
 
 def test_missing_vals_tbl_no_fail_pl_table():
 
@@ -4739,6 +4745,9 @@ def test_missing_vals_tbl_no_fail_pl_table():
     game_revenue = load_dataset(dataset="game_revenue", tbl_type="polars")
     missing_vals_tbl(game_revenue)
 
+    nycflights = load_dataset(dataset="nycflights", tbl_type="polars")
+    missing_vals_tbl(nycflights)
+
 
 def test_missing_vals_tbl_no_fail_duckdb_table():
 
@@ -4748,6 +4757,9 @@ def test_missing_vals_tbl_no_fail_duckdb_table():
     game_revenue = load_dataset(dataset="game_revenue", tbl_type="duckdb")
     missing_vals_tbl(game_revenue)
 
+    nycflights = load_dataset(dataset="nycflights", tbl_type="duckdb")
+    missing_vals_tbl(nycflights)
+
 
 def test_missing_vals_tbl_no_pandas():
 
@@ -4794,9 +4806,11 @@ def test_get_column_count(tbl_type):
 
     small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
     game_revenue = load_dataset(dataset="game_revenue", tbl_type=tbl_type)
+    nycflights = load_dataset(dataset="nycflights", tbl_type=tbl_type)
 
     assert get_column_count(small_table) == 8
     assert get_column_count(game_revenue) == 11
+    assert get_column_count(nycflights) == 18
 
 
 def test_get_column_count_failing():
@@ -4812,9 +4826,11 @@ def test_get_row_count(tbl_type):
 
     small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
     game_revenue = load_dataset(dataset="game_revenue", tbl_type=tbl_type)
+    nycflights = load_dataset(dataset="nycflights", tbl_type=tbl_type)
 
     assert get_row_count(small_table) == 13
     assert get_row_count(game_revenue) == 2000
+    assert get_row_count(nycflights) == 336776
 
 
 def test_get_row_count_failing():