Skip to content

feat: add the nycflights dataset #54

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
336,777 changes: 336,777 additions & 0 deletions data_raw/nycflights.csv

Large diffs are not rendered by default.

Binary file added data_raw/nycflights.ddb
Binary file not shown.
8 changes: 8 additions & 0 deletions data_raw/x-02-duckdb.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ tbl_dates_times_text = pl.DataFrame(
)
small_table = pb.load_dataset(dataset="small_table", tbl_type="polars")
game_revenue = pb.load_dataset(dataset="game_revenue", tbl_type="polars")
nycflights = pb.load_dataset(dataset="nycflights", tbl_type="polars")
```


Expand Down Expand Up @@ -59,3 +60,10 @@ with duckdb.connect(database="game_revenue.ddb", read_only=False) as con:
CREATE TABLE IF NOT EXISTS 'game_revenue' AS SELECT * FROM game_revenue;
""")
```

```{python}
# Persist the `nycflights` table into a DuckDB database file named `nycflights.ddb`.
# `IF NOT EXISTS` leaves an already-created table untouched when the chunk is re-run.
# NOTE(review): `FROM nycflights` presumably resolves to the in-scope Polars DataFrame of
# that name via DuckDB's replacement scans -- confirm against the DuckDB Python docs.
with duckdb.connect(database="nycflights.ddb", read_only=False) as con:
    # Plain string literal: the SQL has no placeholders, so no f-string is needed
    con.execute("""
        CREATE TABLE IF NOT EXISTS 'nycflights' AS SELECT * FROM nycflights;
    """)
```
Binary file added pointblank/data/nycflights-duckdb.zip
Binary file not shown.
Binary file added pointblank/data/nycflights.zip
Binary file not shown.
40 changes: 39 additions & 1 deletion pointblank/validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ def load_dataset(
- `game_revenue`: A dataset with 2000 rows and 11 columns. Provides revenue data for a game
development company. For the particular game, there are records of player sessions, the items
they purchased, ads viewed, and the revenue generated.
- `nycflights`: A dataset with 336,776 rows and 18 columns. This dataset provides information
about flights departing from New York City airports (JFK, LGA, or EWR) in 2013.

Supported DataFrame Types
-------------------------
Expand Down Expand Up @@ -199,10 +201,25 @@ def load_dataset(

The `game_revenue` dataset is a more real-world dataset with a mix of data types, and it's
significantly larger than the `small_table` dataset at 2000 rows and 11 columns.

The `nycflights` dataset can be loaded as a DuckDB table by specifying the dataset name and
setting `tbl_type="duckdb"`:

```{python}
import pointblank as pb

nycflights = pb.load_dataset(dataset="nycflights", tbl_type="duckdb")

pb.preview(nycflights)
```

The `nycflights` dataset is large, with 336,776 rows and 18 columns. It is a real-world dataset
that provides information about flights originating from New York City airports (JFK, LGA, or
EWR) in 2013.
"""

# Raise an error if the dataset is not in the list of provided datasets
if dataset not in ["small_table", "game_revenue"]:
if dataset not in ["small_table", "game_revenue", "nycflights"]:
raise ValueError(
f"The dataset name `{dataset}` is not valid. Choose one of the following:\n"
"- `small_table`\n"
Expand Down Expand Up @@ -245,6 +262,7 @@ def load_dataset(
parse_date_columns = {
"small_table": ["date_time", "date"],
"game_revenue": ["session_start", "time", "start_day"],
"nycflights": [],
}

dataset = pd.read_csv(data_path, parse_dates=parse_date_columns[dataset])
Expand Down Expand Up @@ -831,6 +849,26 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
rows, and so on. Any sectors that are light blue indicate that there are no missing values in
that sector. If there are missing values, the proportion of missing values is shown by a gray
color (light gray for low proportions, dark gray to black for very high proportions).

Examples
--------
The `missing_vals_tbl()` function is useful for quickly identifying columns with missing values
in a table. Here's an example using the `nycflights` dataset (loaded using the `load_dataset()`
function as a Polars DataFrame):

```{python}
import pointblank as pb

nycflights = pb.load_dataset("nycflights", tbl_type="polars")

pb.missing_vals_tbl(nycflights)
```

The table shows the proportion of missing values in each column of the `nycflights` dataset. The
table is divided into sectors, with each sector representing a range of rows in the table (with
around 34,000 rows per sector). The proportion of missing values in each sector is calculated
for each column. The various shades of gray indicate the proportion of missing values in each
sector. Many columns have no missing values at all, and those sectors are colored light blue.
"""

# Make a copy of the data to avoid modifying the original
Expand Down
24 changes: 20 additions & 4 deletions tests/test_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -4481,23 +4481,26 @@ def test_load_dataset():

# Load the default dataset (`small_table`) and verify it's a Polars DataFrame
tbl = load_dataset()

assert isinstance(tbl, pl.DataFrame)

# Load the default dataset (`small_table`) and verify it's a Pandas DataFrame
tbl = load_dataset(tbl_type="pandas")

assert isinstance(tbl, pd.DataFrame)

# Load the `game_revenue` dataset and verify it's a Polars DataFrame
tbl = load_dataset(dataset="game_revenue")

assert isinstance(tbl, pl.DataFrame)

# Load the `game_revenue` dataset and verify it's a Pandas DataFrame

tbl = load_dataset(dataset="game_revenue", tbl_type="pandas")
assert isinstance(tbl, pd.DataFrame)

# Load the `nycflights` dataset and verify it's a Polars DataFrame
tbl = load_dataset(dataset="nycflights")
assert isinstance(tbl, pl.DataFrame)

# Load the `nycflights` dataset and verify it's a Pandas DataFrame
tbl = load_dataset(dataset="nycflights", tbl_type="pandas")
assert isinstance(tbl, pd.DataFrame)


Expand Down Expand Up @@ -4730,6 +4733,9 @@ def test_missing_vals_tbl_no_fail_pd_table():
game_revenue = load_dataset(dataset="game_revenue", tbl_type="pandas")
missing_vals_tbl(game_revenue)

nycflights = load_dataset(dataset="nycflights", tbl_type="pandas")
missing_vals_tbl(nycflights)


def test_missing_vals_tbl_no_fail_pl_table():

Expand All @@ -4739,6 +4745,9 @@ def test_missing_vals_tbl_no_fail_pl_table():
game_revenue = load_dataset(dataset="game_revenue", tbl_type="polars")
missing_vals_tbl(game_revenue)

nycflights = load_dataset(dataset="nycflights", tbl_type="polars")
missing_vals_tbl(nycflights)


def test_missing_vals_tbl_no_fail_duckdb_table():

Expand All @@ -4748,6 +4757,9 @@ def test_missing_vals_tbl_no_fail_duckdb_table():
game_revenue = load_dataset(dataset="game_revenue", tbl_type="duckdb")
missing_vals_tbl(game_revenue)

nycflights = load_dataset(dataset="nycflights", tbl_type="duckdb")
missing_vals_tbl(nycflights)


def test_missing_vals_tbl_no_pandas():

Expand Down Expand Up @@ -4794,9 +4806,11 @@ def test_get_column_count(tbl_type):

small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
game_revenue = load_dataset(dataset="game_revenue", tbl_type=tbl_type)
nycflights = load_dataset(dataset="nycflights", tbl_type=tbl_type)

assert get_column_count(small_table) == 8
assert get_column_count(game_revenue) == 11
assert get_column_count(nycflights) == 18


def test_get_column_count_failing():
Expand All @@ -4812,9 +4826,11 @@ def test_get_row_count(tbl_type):

small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
game_revenue = load_dataset(dataset="game_revenue", tbl_type=tbl_type)
nycflights = load_dataset(dataset="nycflights", tbl_type=tbl_type)

assert get_row_count(small_table) == 13
assert get_row_count(game_revenue) == 2000
assert get_row_count(nycflights) == 336776


def test_get_row_count_failing():
Expand Down