Skip to content

Commit 778ff82

Browse files
committed
Add WIP missing_vals_tbl() function
1 parent d1022c6 commit 778ff82

File tree

2 files changed

+230
-1
lines changed

2 files changed

+230
-1
lines changed

pointblank/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
load_dataset,
2626
config,
2727
preview,
28+
missing_vals_tbl,
2829
get_column_count,
2930
get_row_count,
3031
)
@@ -47,6 +48,7 @@
4748
"load_dataset",
4849
"config",
4950
"preview",
51+
"missing_vals_tbl",
5052
"get_column_count",
5153
"get_row_count",
5254
]

pointblank/validate.py

Lines changed: 228 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,15 @@
6969
)
7070
from pointblank._utils_html import _create_table_type_html, _create_table_dims_html
7171

72-
__all__ = ["Validate", "load_dataset", "config", "preview", "get_column_count", "get_row_count"]
72+
__all__ = [
73+
"Validate",
74+
"load_dataset",
75+
"config",
76+
"preview",
77+
"missing_vals_tbl",
78+
"get_column_count",
79+
"get_row_count",
80+
]
7381

7482

7583
@dataclass
@@ -772,6 +780,225 @@ def _generate_display_table(
772780
return gt_tbl
773781

774782

783+
def missing_vals_tbl(data: FrameT | Any) -> GT:
    """
    Display a table that shows the missing values in the input table.

    The `missing_vals_tbl()` function generates a table that shows the missing values in the input
    table. Each column of the input table becomes a row of the display table, and the input rows
    are divided into consecutive 'sectors'; each sector cell is shaded according to the proportion
    of missing values found for that column within that sector. The table is displayed using the
    Great Tables (`GT`) API, which allows for further customization of the table's appearance if so
    desired.

    Parameters
    ----------
    data
        The table for which to display the missing values. This could be a DataFrame object or an
        Ibis table object. Read the *Supported Input Table Types* section for details on the
        supported table types.

    Returns
    -------
    GT
        A GT object that displays the table of missing values in the input table.

    Raises
    ------
    NotImplementedError
        If `data` is not an Ibis table, or if Polars is not the DataFrame library selected for
        building the display table. This function is a work in progress and these paths have not
        been implemented yet.

    Supported Input Table Types
    ---------------------------
    The `data=` parameter can be given any of the following table types:

    - Polars DataFrame (`"polars"`)
    - Pandas DataFrame (`"pandas"`)
    - DuckDB table (`"duckdb"`)*
    - MySQL table (`"mysql"`)*
    - PostgreSQL table (`"postgresql"`)*
    - SQLite table (`"sqlite"`)*
    - Parquet table (`"parquet"`)*

    The table types marked with an asterisk need to be prepared as Ibis tables (with type of
    `ibis.expr.types.relations.Table`). Furthermore, using `missing_vals_tbl()` with these types of
    tables requires the Ibis library (`v9.5.0` or above) to be installed.

    Note that, in the current work-in-progress state, only the Ibis table types are processed;
    Polars and Pandas DataFrames are recognized but raise `NotImplementedError`.
    """

    # Determine if the table is a DataFrame or an Ibis table
    tbl_type = _get_tbl_type(data=data)
    ibis_tbl = "ibis.expr.types.relations.Table" in str(type(data))
    pl_pb_tbl = "polars" in tbl_type or "pandas" in tbl_type

    # Select the DataFrame library to use for displaying the Ibis table
    df_lib_gt = _select_df_lib(preference="polars")
    df_lib_name_gt = df_lib_gt.__name__

    # If the table is a DataFrame (Pandas or Polars), set `df_lib_name_gt` to the name of the
    # library (e.g., "polars" or "pandas")
    if pl_pb_tbl:
        df_lib_name_gt = "polars" if "polars" in tbl_type else "pandas"

    # Fail explicitly on the paths that are not implemented; previously these paths fell through
    # to undefined local variables and surfaced as an unhelpful `NameError`
    if not ibis_tbl:
        raise NotImplementedError(
            "missing_vals_tbl() currently only supports Ibis tables; "
            f"received a table of type '{tbl_type}'."
        )

    if df_lib_name_gt != "polars":
        raise NotImplementedError(
            "missing_vals_tbl() currently requires Polars for building the display table."
        )

    # Polars is an optional dependency so it is imported only once it is known to be needed
    import polars as pl
    import polars.selectors as cs

    # Make a copy of the data to avoid modifying the original
    data = copy.deepcopy(data)

    # Get the row count for the table
    n_rows = data.count().to_polars()

    # Get the column names from the table
    col_names = list(data.columns)

    # Get the cut points for the table preview; these are row numbers used as the upper bounds of
    # the 'sectors' in which the proportion of missing values is determined for each column
    # NOTE(review): the rows after the final cut point are not part of any sector (the cut points
    # are multiples of `n_rows // n_cut_points` and stop short of `n_rows`) -- confirm intended
    n_cut_points = 11
    cut_points = _get_cut_points(n_rows=n_rows, n_cuts=n_cut_points)

    # Pair each cut point with the preceding one (or zero) to get half-open `[lower, upper)` row
    # ranges, one per sector
    sector_bounds = list(zip([0] + cut_points[:-1], cut_points))

    # The sector columns in the display table are labeled "1", "2", ...
    sector_labels = [str(idx + 1) for idx in range(len(sector_bounds))]

    # Get the proportion of missing values in each sector for each column; the slice has to span
    # the whole sector since the count of missing values is divided by the sector's width
    sector_props = {
        col: [
            (
                data[lower:upper][col].isnull().sum().to_polars() / (upper - lower)
                if upper > lower
                else 0.0
            )
            for lower, upper in sector_bounds
        ]
        for col in col_names
    }

    # Get a dictionary of counts of missing values in each column
    missing_val_counts = {col: data[col].isnull().sum().to_polars() for col in col_names}

    # Transpose to the display layout: one row per input column, one column per sector
    missing_vals = {
        "columns": col_names,
        **{
            label: [sector_props[col][idx] for col in col_names]
            for idx, label in enumerate(sector_labels)
        },
    }

    # Create a Polars DataFrame from the `missing_vals` dictionary
    missing_vals_df = pl.DataFrame(missing_vals)

    # Get a count of total missing values
    n_missing_total = sum(missing_val_counts.values())

    # Create the table type and table dimensions HTML fragments
    table_type_html = _create_table_type_html(tbl_type=tbl_type, tbl_name=None, font_size="10px")

    tbl_dims_html = _create_table_dims_html(columns=len(col_names), rows=n_rows, font_size="10px")

    check_mark = '<span style="color:#4CA64C;">&check;</span>'

    # Compose the title HTML fragment
    if n_missing_total == 0:
        combined_title = f"Missing Values {check_mark}"
    else:
        combined_title = (
            "Missing Values&nbsp;&nbsp;&nbsp;<span style='font-size: 14px; "
            f"text-transform: uppercase; color: #333333'>{n_missing_total} in total</span>"
        )

    # Compose the subtitle HTML fragment
    combined_subtitle = (
        "<div>"
        '<div style="padding-top: 0; padding-bottom: 7px;">'
        f"{table_type_html}"
        f"{tbl_dims_html}"
        "</div>"
        "</div>"
    )

    # Build the display table (the result is not named after the function to avoid shadowing it)
    gt_tbl = (
        GT(missing_vals_df)
        .tab_header(title=html(combined_title), subtitle=html(combined_subtitle))
        .opt_table_font(font=google_font(name="IBM Plex Sans"))
        .opt_align_table_header(align="left")
        .cols_label(columns="Column")
        .cols_width(cases={"columns": "200px", **{label: "30px" for label in sector_labels}})
        .cols_align(align="center", columns=sector_labels)
        .data_color(
            columns=sector_labels,
            palette=["#F5F5F5", "#000000"],
            domain=[0, 1],
        )
        .tab_style(
            style=style.borders(
                sides=["left", "right"], color="#F0F0F0", style="solid", weight="1px"
            ),
            locations=loc.body(columns=sector_labels),
        )
        .tab_style(
            style=style.css(
                "height: 20px; padding: 4px; white-space: nowrap; text-overflow: "
                "ellipsis; overflow: hidden;"
            ),
            locations=loc.body(),
        )
        .tab_style(
            style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="12px"),
            locations=loc.body(),
        )
        .tab_style(
            style=style.text(color="black", size="16px"),
            locations=loc.column_labels(),
        )
        # The sector cells convey their values through color only, so the text is blanked out
        .fmt(fns=lambda x: "", columns=sector_labels)
        .tab_style(style=style.fill(color="lightblue"), locations=loc.body(mask=cs.numeric().eq(0)))
    )

    return gt_tbl
982+
983+
984+
def _get_cut_points(n_rows: int, n_cuts: int) -> list[int]:
985+
"""
986+
Get the cut points for a table preview.
987+
988+
For a certain number of rows and a certain number of cuts, get the cut points, which are integer
989+
values that divide the rows into equal parts (all parts don't have to be equal, but the cut
990+
points should be as close to equal as possible and add to the total number of rows).
991+
"""
992+
993+
# Get the number of rows in each cut
994+
cut_size = n_rows // n_cuts
995+
996+
# Get the cut points
997+
cut_points = [cut_size * i for i in range(1, n_cuts)]
998+
999+
return cut_points
1000+
1001+
7751002
def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
7761003
if ibis_tbl:
7771004
return data.columns if df_lib_name_gt == "polars" else list(data.columns)

0 commit comments

Comments
 (0)