Skip to content

Commit 2354f0f

Browse files
authored
Merge pull request #54 from posit-dev/feat-add-nycflights-dataset
feat: add the `nycflights` dataset
2 parents 03502b1 + ca1a414 commit 2354f0f

File tree

7 files changed

+336844
-5
lines changed

7 files changed

+336844
-5
lines changed

data_raw/nycflights.csv

+336,777
Large diffs are not rendered by default.

data_raw/nycflights.ddb

7.01 MB
Binary file not shown.

data_raw/x-02-duckdb.qmd

+8
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ tbl_dates_times_text = pl.DataFrame(
2121
)
2222
small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
2323
game_revenue = pb.load_dataset(dataset="game_revenue", tbl_type="polars")
24+
nycflights = pb.load_dataset(dataset="nycflights", tbl_type="polars")
2425
```
2526

2627

@@ -59,3 +60,10 @@ with duckdb.connect(database="game_revenue.ddb", read_only=False) as con:
5960
CREATE TABLE IF NOT EXISTS 'game_revenue' AS SELECT * FROM game_revenue;
6061
""")
6162
```
63+
64+
```{python}
65+
with duckdb.connect(database="nycflights.ddb", read_only=False) as con:
66+
con.execute(f"""
67+
CREATE TABLE IF NOT EXISTS 'nycflights' AS SELECT * FROM nycflights;
68+
""")
69+
```

pointblank/data/nycflights-duckdb.zip

5.05 MB
Binary file not shown.

pointblank/data/nycflights.zip

7.47 MB
Binary file not shown.

pointblank/validate.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,8 @@ def load_dataset(
161161
- `game_revenue`: A dataset with 2000 rows and 11 columns. Provides revenue data for a game
162162
development company. For the particular game, there are records of player sessions, the items
163163
they purchased, ads viewed, and the revenue generated.
164+
- `nycflights`: A dataset with 336,776 rows and 18 columns. This dataset provides information
165+
about flights departing from New York City airports (JFK, LGA, or EWR) in 2013.
164166
165167
Supported DataFrame Types
166168
-------------------------
@@ -199,10 +201,25 @@ def load_dataset(
199201
200202
The `game_revenue` dataset is a more real-world dataset with a mix of data types, and it's
201203
significantly larger than the `small_table` dataset at 2000 rows and 11 columns.
204+
205+
The `nycflights` dataset can be loaded as a DuckDB table by specifying the dataset name and
206+
setting `tbl_type="duckdb"`:
207+
208+
```{python}
209+
import pointblank as pb
210+
211+
nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")
212+
213+
pb.preview(nycflights)
214+
```
215+
216+
The `nycflights` dataset is a large dataset with 336,776 rows and 18 columns. It is a
217+
real-world dataset that provides information about flights originating from New York
218+
City airports in 2013.
202219
"""
203220

204221
# Raise an error if the dataset is not in the list of provided datasets
205-
if dataset not in ["small_table", "game_revenue"]:
222+
if dataset not in ["small_table", "game_revenue", "nycflights"]:
206223
raise ValueError(
207224
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
208225
"- `small_table`\n"
@@ -245,6 +262,7 @@ def load_dataset(
245262
parse_date_columns = {
246263
"small_table": ["date_time", "date"],
247264
"game_revenue": ["session_start", "time", "start_day"],
265+
"nycflights": [],
248266
}
249267

250268
dataset = pd.read_csv(data_path, parse_dates=parse_date_columns[dataset])
@@ -831,6 +849,26 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
831849
rows, and so on. Any sectors that are light blue indicate that there are no missing values in
832850
that sector. If there are missing values, the proportion of missing values is shown by a gray
833851
color (light gray for low proportions, dark gray to black for very high proportions).
852+
853+
Examples
854+
--------
855+
The `missing_vals_tbl()` function is useful for quickly identifying columns with missing values
856+
in a table. Here's an example using the `nycflights` dataset (loaded as a Polars DataFrame
857+
using the `load_dataset()` function):
858+
859+
```{python}
860+
import pointblank as pb
861+
862+
nycflights = pb.load_dataset("nycflights", tbl_type="polars")
863+
864+
pb.missing_vals_tbl(nycflights)
865+
```
866+
867+
The table shows the proportion of missing values in each column of the `nycflights` dataset. The
868+
table is divided into sectors, with each sector representing a range of rows in the table (with
869+
around 34,000 rows per sector). The proportion of missing values in each sector is calculated
870+
for each column. The various shades of gray indicate the proportion of missing values in each
871+
sector. Many columns have no missing values at all, and those sectors are colored light blue.
834872
"""
835873

836874
# Make a copy of the data to avoid modifying the original

tests/test_validate.py

+20-4
Original file line numberDiff line numberDiff line change
@@ -4481,23 +4481,26 @@ def test_load_dataset():
44814481

44824482
# Load the default dataset (`small_table`) and verify it's a Polars DataFrame
44834483
tbl = load_dataset()
4484-
44854484
assert isinstance(tbl, pl.DataFrame)
44864485

44874486
# Load the default dataset (`small_table`) and verify it's a Pandas DataFrame
44884487
tbl = load_dataset(tbl_type="pandas")
4489-
44904488
assert isinstance(tbl, pd.DataFrame)
44914489

44924490
# Load the `game_revenue` dataset and verify it's a Polars DataFrame
44934491
tbl = load_dataset(dataset="game_revenue")
4494-
44954492
assert isinstance(tbl, pl.DataFrame)
44964493

44974494
# Load the `game_revenue` dataset and verify it's a Pandas DataFrame
4498-
44994495
tbl = load_dataset(dataset="game_revenue", tbl_type="pandas")
4496+
assert isinstance(tbl, pd.DataFrame)
45004497

4498+
# Load the `nycflights` dataset and verify it's a Polars DataFrame
4499+
tbl = load_dataset(dataset="nycflights")
4500+
assert isinstance(tbl, pl.DataFrame)
4501+
4502+
# Load the `nycflights` dataset and verify it's a Pandas DataFrame
4503+
tbl = load_dataset(dataset="nycflights", tbl_type="pandas")
45014504
assert isinstance(tbl, pd.DataFrame)
45024505

45034506

@@ -4730,6 +4733,9 @@ def test_missing_vals_tbl_no_fail_pd_table():
47304733
game_revenue = load_dataset(dataset="game_revenue", tbl_type="pandas")
47314734
missing_vals_tbl(game_revenue)
47324735

4736+
nycflights = load_dataset(dataset="nycflights", tbl_type="pandas")
4737+
missing_vals_tbl(nycflights)
4738+
47334739

47344740
def test_missing_vals_tbl_no_fail_pl_table():
47354741

@@ -4739,6 +4745,9 @@ def test_missing_vals_tbl_no_fail_pl_table():
47394745
game_revenue = load_dataset(dataset="game_revenue", tbl_type="polars")
47404746
missing_vals_tbl(game_revenue)
47414747

4748+
nycflights = load_dataset(dataset="nycflights", tbl_type="polars")
4749+
missing_vals_tbl(nycflights)
4750+
47424751

47434752
def test_missing_vals_tbl_no_fail_duckdb_table():
47444753

@@ -4748,6 +4757,9 @@ def test_missing_vals_tbl_no_fail_duckdb_table():
47484757
game_revenue = load_dataset(dataset="game_revenue", tbl_type="duckdb")
47494758
missing_vals_tbl(game_revenue)
47504759

4760+
nycflights = load_dataset(dataset="nycflights", tbl_type="duckdb")
4761+
missing_vals_tbl(nycflights)
4762+
47514763

47524764
def test_missing_vals_tbl_no_pandas():
47534765

@@ -4794,9 +4806,11 @@ def test_get_column_count(tbl_type):
47944806

47954807
small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
47964808
game_revenue = load_dataset(dataset="game_revenue", tbl_type=tbl_type)
4809+
nycflights = load_dataset(dataset="nycflights", tbl_type=tbl_type)
47974810

47984811
assert get_column_count(small_table) == 8
47994812
assert get_column_count(game_revenue) == 11
4813+
assert get_column_count(nycflights) == 18
48004814

48014815

48024816
def test_get_column_count_failing():
@@ -4812,9 +4826,11 @@ def test_get_row_count(tbl_type):
48124826

48134827
small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
48144828
game_revenue = load_dataset(dataset="game_revenue", tbl_type=tbl_type)
4829+
nycflights = load_dataset(dataset="nycflights", tbl_type=tbl_type)
48154830

48164831
assert get_row_count(small_table) == 13
48174832
assert get_row_count(game_revenue) == 2000
4833+
assert get_row_count(nycflights) == 336776
48184834

48194835

48204836
def test_get_row_count_failing():

0 commit comments

Comments
 (0)