Skip to content

Commit f0aba63

Browse files
committed
Rework the missing_vals_tbl() function
1 parent d142ef1 commit f0aba63

File tree

1 file changed

+142
-23
lines changed

1 file changed

+142
-23
lines changed

pointblank/validate.py

Lines changed: 142 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -825,11 +825,14 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
825825
n_rows = get_row_count(data)
826826

827827
# Define the number of cut points for the missing values table
828-
n_cut_points = 11
828+
n_cut_points = 9
829829

830830
# Get the cut points for the table preview
831831
cut_points = _get_cut_points(n_rows=n_rows, n_cuts=n_cut_points)
832832

833+
# Get the row ranges for the table
834+
row_ranges = _get_row_ranges(cut_points=cut_points, n_rows=n_rows)
835+
833836
# Determine if the table is a DataFrame or an Ibis table
834837
tbl_type = _get_tbl_type(data=data)
835838
ibis_tbl = "ibis.expr.types.relations.Table" in str(type(data))
@@ -859,26 +862,40 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
859862
# Get the column names from the table
860863
col_names = list(data.columns)
861864

862-
# Iterate over the cut points and get the proportion of missing values in each 'sector'
863-
# for each column
865+
# Use the `cut_points` list to query, for each column, the percentage of missing
866+
# values in each 'sector' of the table (a sector is a range of rows)
864867
missing_vals = {
865868
col: [
866869
(
867-
data[(cut_points[i] - 1) : cut_points[i]][col].isnull().sum().to_polars()
870+
data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
871+
.isnull()
872+
.sum()
873+
.to_polars()
868874
/ (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
875+
* 100
869876
if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
870877
else 0
871878
)
872879
for i in range(len(cut_points))
873880
]
881+
+ [
882+
(
883+
data[cut_points[-1] : n_rows][col].isnull().sum().to_polars()
884+
/ (n_rows - cut_points[-1])
885+
* 100
886+
if n_rows > cut_points[-1]
887+
else 0
888+
)
889+
]
874890
for col in data.columns
875891
}
876892

877893
# Pivot the `missing_vals` dictionary to create a table with the missing value proportions
878894
missing_vals = {
879895
"columns": list(missing_vals.keys()),
880896
**{
881-
str(i + 1): [missing_vals[col][i] for col in missing_vals.keys()] for i in range(10)
897+
str(i + 1): [missing_vals[col][i] for col in missing_vals.keys()]
898+
for i in range(len(cut_points) + 1)
882899
},
883900
}
884901

@@ -898,23 +915,33 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
898915
missing_vals = {
899916
col: [
900917
(
901-
data[(cut_points[i] - 1) : cut_points[i]][col].is_null().sum()
918+
data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
919+
.is_null()
920+
.sum()
902921
/ (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
922+
* 100
903923
if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
904924
else 0
905925
)
906926
for i in range(len(cut_points))
907927
]
928+
+ [
929+
(
930+
data[cut_points[-1] : n_rows][col].is_null().sum()
931+
/ (n_rows - cut_points[-1])
932+
* 100
933+
if n_rows > cut_points[-1]
934+
else 0
935+
)
936+
]
908937
for col in data.columns
909938
}
910939

911-
# Pivot the `missing_vals` dictionary to create a table with the missing
912-
# value proportions
913940
missing_vals = {
914941
"columns": list(missing_vals.keys()),
915942
**{
916943
str(i + 1): [missing_vals[col][i] for col in missing_vals.keys()]
917-
for i in range(10)
944+
for i in range(len(cut_points) + 1)
918945
},
919946
}
920947

@@ -923,17 +950,28 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
923950

924951
if "pandas" in tbl_type:
925952

926-
# Pandas case (for this case, if final values are zero then use pd.NA)
927953
missing_vals = {
928954
col: [
929955
(
930-
data[(cut_points[i] - 1) : cut_points[i]][col].isnull().sum()
956+
data[(cut_points[i - 1] if i > 0 else 0) : cut_points[i]][col]
957+
.isnull()
958+
.sum()
931959
/ (cut_points[i] - (cut_points[i - 1] if i > 0 else 0))
960+
* 100
932961
if cut_points[i] > (cut_points[i - 1] if i > 0 else 0)
933962
else 0
934963
)
935964
for i in range(len(cut_points))
936965
]
966+
+ [
967+
(
968+
data[cut_points[-1] : n_rows][col].isnull().sum()
969+
/ (n_rows - cut_points[-1])
970+
* 100
971+
if n_rows > cut_points[-1]
972+
else 0
973+
)
974+
]
937975
for col in data.columns
938976
}
939977

@@ -943,7 +981,7 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
943981
"columns": list(missing_vals.keys()),
944982
**{
945983
str(i + 1): [missing_vals[col][i] for col in missing_vals.keys()]
946-
for i in range(10)
984+
for i in range(len(cut_points) + 1)
947985
},
948986
}
949987

@@ -994,6 +1032,38 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
9941032
"</div>"
9951033
)
9961034

1035+
# Get the row ranges for the table
1036+
row_ranges = _get_row_ranges(cut_points=cut_points, n_rows=n_rows)
1037+
1038+
row_ranges_html = (
1039+
"<div style='font-size: 8px;'><ol style='margin-top: 2px; margin-left: -15px;'>"
1040+
+ "".join(
1041+
[f"<li>{row_range[0]} &ndash; {row_range[1]}</li>" for row_range in zip(*row_ranges)]
1042+
)
1043+
+ "</ol></div>"
1044+
)
1045+
1046+
details_html = (
1047+
"<details style='cursor: pointer; font-size: 12px;'><summary style='font-size: 10px; color: #333333;'>ROW SECTORS</summary>"
1048+
f"{row_ranges_html}"
1049+
"</details>"
1050+
)
1051+
1052+
# Compose the footer HTML fragment
1053+
combined_footer = (
1054+
"<div style='display: flex; align-items: center; padding-bottom: 10px;'><div style='width: 20px; height: 20px; "
1055+
"background-color: lightblue; border: 1px solid #E0E0E0; margin-right: 3px;'></div>"
1056+
"<span style='font-size: 10px;'>NO MISSING VALUES</span><span style='font-size: 10px;'>"
1057+
"&nbsp;&nbsp;&nbsp;&nbsp; PROPORTION MISSING:&nbsp;&nbsp;</span>"
1058+
"<div style='font-size: 10px; color: #333333;'>0%</div><div style='width: 80px; "
1059+
"height: 20px; background: linear-gradient(to right, #F5F5F5, #000000); "
1060+
"border: 1px solid #E0E0E0; margin-right: 2px; margin-left: 2px'></div>"
1061+
"<div style='font-size: 10px; color: #333333;'>100%</div></div>"
1062+
f"{details_html}"
1063+
)
1064+
1065+
sector_list = [str(i) for i in range(1, n_cut_points + 2)]
1066+
9971067
missing_vals_tbl = (
9981068
GT(missing_vals_df)
9991069
.tab_header(title=html(combined_title), subtitle=html(combined_subtitle))
@@ -1015,17 +1085,18 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
10151085
"10": "30px",
10161086
}
10171087
)
1018-
.cols_align(align="center", columns=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"])
1088+
.tab_spanner(label="Row Sector", columns=sector_list)
1089+
.cols_align(align="center", columns=sector_list)
10191090
.data_color(
1020-
columns=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
1091+
columns=sector_list,
10211092
palette=["#F5F5F5", "#000000"],
10221093
domain=[0, 1],
10231094
)
10241095
.tab_style(
10251096
style=style.borders(
10261097
sides=["left", "right"], color="#F0F0F0", style="solid", weight="1px"
10271098
),
1028-
locations=loc.body(columns=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]),
1099+
locations=loc.body(columns=sector_list),
10291100
)
10301101
.tab_style(
10311102
style=style.css(
@@ -1042,7 +1113,8 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
10421113
style=style.text(color="black", size="16px"),
10431114
locations=loc.column_labels(),
10441115
)
1045-
.fmt(fns=lambda x: "", columns=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"])
1116+
.fmt(fns=lambda x: "", columns=sector_list)
1117+
.tab_source_note(source_note=html(combined_footer))
10461118
)
10471119

10481120
#
@@ -1075,22 +1147,69 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
10751147

10761148
def _get_cut_points(n_rows: int, n_cuts: int) -> list[int]:
10771149
"""
1078-
Get the cut points for a table preview.
1150+
Get the cut points for a table.
1151+
1152+
For a given number of rows and cuts, get the cut points for the table. The cut points are
1153+
evenly spaced in the range from 1 to n_rows, excluding the first and last points.
1154+
1155+
Parameters
1156+
----------
1157+
n_rows
1158+
The total number of rows in the table.
1159+
n_cuts
1160+
The number of cuts to divide the table into.
10791161
1080-
For a certain number of rows and a certain number of cuts, get the cut points, which are integer
1081-
values that divide the rows into equal parts (all parts don't have to be equal, but the cut
1082-
points should be as close to equal as possible and add to the total number of rows).
1162+
Returns
1163+
-------
1164+
list[int]
1165+
A list of integer values that represent the cut points for the table.
10831166
"""
10841167

1085-
# Get the number of rows in each cut
1086-
cut_size = n_rows // n_cuts
1168+
# Calculate the step size
1169+
step_size = n_rows // (n_cuts + 1)
10871170

10881171
# Get the cut points
1089-
cut_points = [cut_size * i for i in range(1, n_cuts)]
1172+
cut_points = [step_size * i for i in range(1, n_cuts + 1)]
10901173

10911174
return cut_points
10921175

10931176

1177+
def _get_row_ranges(cut_points: list[int], n_rows: int) -> list[list[int]]:
1178+
"""
1179+
Get the row ranges for a missing values table.
1180+
1181+
For a list of cut points, get the row ranges for a missing values table. The row ranges are
1182+
formatted as lists of integers like [1, 10], [11, 20], etc.
1183+
1184+
Parameters
1185+
----------
1186+
cut_points
1187+
A list of integer values that represent the cut points for the table.
1188+
1189+
Returns
1190+
-------
1191+
list[list[int]]
1192+
A list of lists that represent the row ranges for the table.
1193+
"""
1194+
row_ranges = []
1195+
1196+
for i in range(len(cut_points)):
1197+
if i == 0:
1198+
row_ranges.append([1, cut_points[i]])
1199+
else:
1200+
row_ranges.append([cut_points[i - 1] + 1, cut_points[i]])
1201+
1202+
# Add the final range to incorporate n_rows
1203+
if cut_points[-1] < n_rows:
1204+
row_ranges.append([cut_points[-1] + 1, n_rows])
1205+
1206+
# Split the row ranges into two lists: LHS and RHS
1207+
lhs_values = [pair[0] for pair in row_ranges]
1208+
rhs_values = [pair[1] for pair in row_ranges]
1209+
1210+
return [lhs_values, rhs_values]
1211+
1212+
10941213
def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
10951214
if ibis_tbl:
10961215
return data.columns if df_lib_name_gt == "polars" else list(data.columns)

0 commit comments

Comments
 (0)