@@ -825,11 +825,14 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
825
825
n_rows = get_row_count (data )
826
826
827
827
# Define the number of cut points for the missing values table
828
- n_cut_points = 11
828
+ n_cut_points = 9
829
829
830
830
# Get the cut points for the table preview
831
831
cut_points = _get_cut_points (n_rows = n_rows , n_cuts = n_cut_points )
832
832
833
+ # Get the row ranges for the table
834
+ row_ranges = _get_row_ranges (cut_points = cut_points , n_rows = n_rows )
835
+
833
836
# Determine if the table is a DataFrame or an Ibis table
834
837
tbl_type = _get_tbl_type (data = data )
835
838
ibis_tbl = "ibis.expr.types.relations.Table" in str (type (data ))
@@ -859,26 +862,40 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
859
862
# Get the column names from the table
860
863
col_names = list (data .columns )
861
864
862
- # Iterate over the cut points and get the proportion of missing values in each 'sector'
863
- # for each column
865
+ # Use the `row_ranges` list of lists to query, for each column, the proportion of missing
866
+ # values in each 'sector' of the table (a sector is a range of rows)
864
867
missing_vals = {
865
868
col : [
866
869
(
867
- data [(cut_points [i ] - 1 ) : cut_points [i ]][col ].isnull ().sum ().to_polars ()
870
+ data [(cut_points [i - 1 ] if i > 0 else 0 ) : cut_points [i ]][col ]
871
+ .isnull ()
872
+ .sum ()
873
+ .to_polars ()
868
874
/ (cut_points [i ] - (cut_points [i - 1 ] if i > 0 else 0 ))
875
+ * 100
869
876
if cut_points [i ] > (cut_points [i - 1 ] if i > 0 else 0 )
870
877
else 0
871
878
)
872
879
for i in range (len (cut_points ))
873
880
]
881
+ + [
882
+ (
883
+ data [cut_points [- 1 ] : n_rows ][col ].isnull ().sum ().to_polars ()
884
+ / (n_rows - cut_points [- 1 ])
885
+ * 100
886
+ if n_rows > cut_points [- 1 ]
887
+ else 0
888
+ )
889
+ ]
874
890
for col in data .columns
875
891
}
876
892
877
893
# Pivot the `missing_vals` dictionary to create a table with the missing value proportions
878
894
missing_vals = {
879
895
"columns" : list (missing_vals .keys ()),
880
896
** {
881
- str (i + 1 ): [missing_vals [col ][i ] for col in missing_vals .keys ()] for i in range (10 )
897
+ str (i + 1 ): [missing_vals [col ][i ] for col in missing_vals .keys ()]
898
+ for i in range (len (cut_points ) + 1 )
882
899
},
883
900
}
884
901
@@ -898,23 +915,33 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
898
915
missing_vals = {
899
916
col : [
900
917
(
901
- data [(cut_points [i ] - 1 ) : cut_points [i ]][col ].is_null ().sum ()
918
+ data [(cut_points [i - 1 ] if i > 0 else 0 ) : cut_points [i ]][col ]
919
+ .is_null ()
920
+ .sum ()
902
921
/ (cut_points [i ] - (cut_points [i - 1 ] if i > 0 else 0 ))
922
+ * 100
903
923
if cut_points [i ] > (cut_points [i - 1 ] if i > 0 else 0 )
904
924
else 0
905
925
)
906
926
for i in range (len (cut_points ))
907
927
]
928
+ + [
929
+ (
930
+ data [cut_points [- 1 ] : n_rows ][col ].is_null ().sum ()
931
+ / (n_rows - cut_points [- 1 ])
932
+ * 100
933
+ if n_rows > cut_points [- 1 ]
934
+ else 0
935
+ )
936
+ ]
908
937
for col in data .columns
909
938
}
910
939
911
- # Pivot the `missing_vals` dictionary to create a table with the missing
912
- # value proportions
913
940
missing_vals = {
914
941
"columns" : list (missing_vals .keys ()),
915
942
** {
916
943
str (i + 1 ): [missing_vals [col ][i ] for col in missing_vals .keys ()]
917
- for i in range (10 )
944
+ for i in range (len ( cut_points ) + 1 )
918
945
},
919
946
}
920
947
@@ -923,17 +950,28 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
923
950
924
951
if "pandas" in tbl_type :
925
952
926
- # Pandas case (for this case, if final values are zero then use pd.NA)
927
953
missing_vals = {
928
954
col : [
929
955
(
930
- data [(cut_points [i ] - 1 ) : cut_points [i ]][col ].isnull ().sum ()
956
+ data [(cut_points [i - 1 ] if i > 0 else 0 ) : cut_points [i ]][col ]
957
+ .isnull ()
958
+ .sum ()
931
959
/ (cut_points [i ] - (cut_points [i - 1 ] if i > 0 else 0 ))
960
+ * 100
932
961
if cut_points [i ] > (cut_points [i - 1 ] if i > 0 else 0 )
933
962
else 0
934
963
)
935
964
for i in range (len (cut_points ))
936
965
]
966
+ + [
967
+ (
968
+ data [cut_points [- 1 ] : n_rows ][col ].isnull ().sum ()
969
+ / (n_rows - cut_points [- 1 ])
970
+ * 100
971
+ if n_rows > cut_points [- 1 ]
972
+ else 0
973
+ )
974
+ ]
937
975
for col in data .columns
938
976
}
939
977
@@ -943,7 +981,7 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
943
981
"columns" : list (missing_vals .keys ()),
944
982
** {
945
983
str (i + 1 ): [missing_vals [col ][i ] for col in missing_vals .keys ()]
946
- for i in range (10 )
984
+ for i in range (len ( cut_points ) + 1 )
947
985
},
948
986
}
949
987
@@ -994,6 +1032,38 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
994
1032
"</div>"
995
1033
)
996
1034
1035
+ # Get the row ranges for the table
1036
+ row_ranges = _get_row_ranges (cut_points = cut_points , n_rows = n_rows )
1037
+
1038
+ row_ranges_html = (
1039
+ "<div style='font-size: 8px;'><ol style='margin-top: 2px; margin-left: -15px;'>"
1040
+ + "" .join (
1041
+ [f"<li>{ row_range [0 ]} – { row_range [1 ]} </li>" for row_range in zip (* row_ranges )]
1042
+ )
1043
+ + "</ol></div>"
1044
+ )
1045
+
1046
+ details_html = (
1047
+ "<details style='cursor: pointer; font-size: 12px;'><summary style='font-size: 10px; color: #333333;'>ROW SECTORS</summary>"
1048
+ f"{ row_ranges_html } "
1049
+ "</details>"
1050
+ )
1051
+
1052
+ # Compose the footer HTML fragment
1053
+ combined_footer = (
1054
+ "<div style='display: flex; align-items: center; padding-bottom: 10px;'><div style='width: 20px; height: 20px; "
1055
+ "background-color: lightblue; border: 1px solid #E0E0E0; margin-right: 3px;'></div>"
1056
+ "<span style='font-size: 10px;'>NO MISSING VALUES</span><span style='font-size: 10px;'>"
1057
+ " PROPORTION MISSING: </span>"
1058
+ "<div style='font-size: 10px; color: #333333;'>0%</div><div style='width: 80px; "
1059
+ "height: 20px; background: linear-gradient(to right, #F5F5F5, #000000); "
1060
+ "border: 1px solid #E0E0E0; margin-right: 2px; margin-left: 2px'></div>"
1061
+ "<div style='font-size: 10px; color: #333333;'>100%</div></div>"
1062
+ f"{ details_html } "
1063
+ )
1064
+
1065
+ sector_list = [str (i ) for i in range (1 , n_cut_points + 2 )]
1066
+
997
1067
missing_vals_tbl = (
998
1068
GT (missing_vals_df )
999
1069
.tab_header (title = html (combined_title ), subtitle = html (combined_subtitle ))
@@ -1015,17 +1085,18 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
1015
1085
"10" : "30px" ,
1016
1086
}
1017
1087
)
1018
- .cols_align (align = "center" , columns = ["1" , "2" , "3" , "4" , "5" , "6" , "7" , "8" , "9" , "10" ])
1088
+ .tab_spanner (label = "Row Sector" , columns = sector_list )
1089
+ .cols_align (align = "center" , columns = sector_list )
1019
1090
.data_color (
1020
- columns = [ "1" , "2" , "3" , "4" , "5" , "6" , "7" , "8" , "9" , "10" ] ,
1091
+ columns = sector_list ,
1021
1092
palette = ["#F5F5F5" , "#000000" ],
1022
1093
domain = [0 , 1 ],
1023
1094
)
1024
1095
.tab_style (
1025
1096
style = style .borders (
1026
1097
sides = ["left" , "right" ], color = "#F0F0F0" , style = "solid" , weight = "1px"
1027
1098
),
1028
- locations = loc .body (columns = [ "1" , "2" , "3" , "4" , "5" , "6" , "7" , "8" , "9" , "10" ] ),
1099
+ locations = loc .body (columns = sector_list ),
1029
1100
)
1030
1101
.tab_style (
1031
1102
style = style .css (
@@ -1042,7 +1113,8 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
1042
1113
style = style .text (color = "black" , size = "16px" ),
1043
1114
locations = loc .column_labels (),
1044
1115
)
1045
- .fmt (fns = lambda x : "" , columns = ["1" , "2" , "3" , "4" , "5" , "6" , "7" , "8" , "9" , "10" ])
1116
+ .fmt (fns = lambda x : "" , columns = sector_list )
1117
+ .tab_source_note (source_note = html (combined_footer ))
1046
1118
)
1047
1119
1048
1120
#
@@ -1075,22 +1147,69 @@ def missing_vals_tbl(data: FrameT | Any) -> GT:
1075
1147
1076
1148
def _get_cut_points (n_rows : int , n_cuts : int ) -> list [int ]:
1077
1149
"""
1078
- Get the cut points for a table preview.
1150
+ Get the cut points for a table.
1151
+
1152
+ For a given number of rows and cuts, get the cut points for the table. The cut points are
1153
+ evenly spaced in the range from 1 to n_rows, excluding the first and last points.
1154
+
1155
+ Parameters
1156
+ ----------
1157
+ n_rows
1158
+ The total number of rows in the table.
1159
+ n_cuts
1160
+ The number of cuts to divide the table into.
1079
1161
1080
- For a certain number of rows and a certain number of cuts, get the cut points, which are integer
1081
- values that divide the rows into equal parts (all parts don't have to be equal, but the cut
1082
- points should be as close to equal as possible and add to the total number of rows).
1162
+ Returns
1163
+ -------
1164
+ list[int]
1165
+ A list of integer values that represent the cut points for the table.
1083
1166
"""
1084
1167
1085
- # Get the number of rows in each cut
1086
- cut_size = n_rows // n_cuts
1168
+ # Calculate the step size
1169
+ step_size = n_rows // ( n_cuts + 1 )
1087
1170
1088
1171
# Get the cut points
1089
- cut_points = [cut_size * i for i in range (1 , n_cuts )]
1172
+ cut_points = [step_size * i for i in range (1 , n_cuts + 1 )]
1090
1173
1091
1174
return cut_points
1092
1175
1093
1176
1177
+ def _get_row_ranges (cut_points : list [int ], n_rows : int ) -> list [list [int ]]:
1178
+ """
1179
+ Get the row ranges for a missing values table.
1180
+
1181
+ For a list of cut points, get the row ranges for a missing values table. The row ranges are
1182
+ formatted as lists of integers like [1, 10], [11, 20], etc.
1183
+
1184
+ Parameters
1185
+ ----------
1186
+ cut_points
1187
+ A list of integer values that represent the cut points for the table.
1188
+
1189
+ Returns
1190
+ -------
1191
+ list[list[int]]
1192
+ A list of lists that represent the row ranges for the table.
1193
+ """
1194
+ row_ranges = []
1195
+
1196
+ for i in range (len (cut_points )):
1197
+ if i == 0 :
1198
+ row_ranges .append ([1 , cut_points [i ]])
1199
+ else :
1200
+ row_ranges .append ([cut_points [i - 1 ] + 1 , cut_points [i ]])
1201
+
1202
+ # Add the final range to incorporate n_rows
1203
+ if cut_points [- 1 ] < n_rows :
1204
+ row_ranges .append ([cut_points [- 1 ] + 1 , n_rows ])
1205
+
1206
+ # Split the row ranges into two lists: LHS and RHS
1207
+ lhs_values = [pair [0 ] for pair in row_ranges ]
1208
+ rhs_values = [pair [1 ] for pair in row_ranges ]
1209
+
1210
+ return [lhs_values , rhs_values ]
1211
+
1212
+
1094
1213
def _get_column_names (data : FrameT | Any , ibis_tbl : bool , df_lib_name_gt : str ) -> list [str ]:
1095
1214
if ibis_tbl :
1096
1215
return data .columns if df_lib_name_gt == "polars" else list (data .columns )
0 commit comments