|
69 | 69 | )
|
70 | 70 | from pointblank._utils_html import _create_table_type_html, _create_table_dims_html
|
71 | 71 |
|
72 |
| -__all__ = ["Validate", "load_dataset", "config", "preview", "get_column_count", "get_row_count"] |
| 72 | +__all__ = [ |
| 73 | + "Validate", |
| 74 | + "load_dataset", |
| 75 | + "config", |
| 76 | + "preview", |
| 77 | + "missing_vals_tbl", |
| 78 | + "get_column_count", |
| 79 | + "get_row_count", |
| 80 | +] |
73 | 81 |
|
74 | 82 |
|
75 | 83 | @dataclass
|
@@ -772,6 +780,225 @@ def _generate_display_table(
|
772 | 780 | return gt_tbl
|
773 | 781 |
|
774 | 782 |
|
def missing_vals_tbl(data: FrameT | Any) -> GT:
    """
    Display a table that shows the missing values in the input table.

    The `missing_vals_tbl()` function generates a table that shows where missing values occur in
    the input table. The rows of the input table are divided into ten consecutive 'sectors' and,
    for every column, the proportion of missing values within each sector is computed. The result
    has one row per input column and one cell per sector, with each cell shaded according to that
    proportion. The table is displayed using the Great Tables (`GT`) API, which allows for further
    customization of the table's appearance if so desired.

    Parameters
    ----------
    data
        The table for which to display the missing values. Read the *Supported Input Table Types*
        section for details on the supported table types.

    Returns
    -------
    GT
        A GT object that displays the table of missing values in the input table.

    Raises
    ------
    NotImplementedError
        If `data=` is not an Ibis table, or if the Polars library is not the DataFrame library
        selected for building the display table.

    Supported Input Table Types
    ---------------------------
    The `data=` parameter can currently be given any of the following table types, prepared as
    Ibis tables (with type of `ibis.expr.types.relations.Table`):

    - DuckDB table (`"duckdb"`)
    - MySQL table (`"mysql"`)
    - PostgreSQL table (`"postgresql"`)
    - SQLite table (`"sqlite"`)
    - Parquet table (`"parquet"`)

    Using `missing_vals_tbl()` with these types of tables requires the Ibis library (`v9.5.0` or
    above) to be installed, and the display table is assembled with Polars. Polars and Pandas
    DataFrames are not yet handled as input and raise `NotImplementedError`.
    """

    # Identify the table type (shown in the table header) and whether this is an Ibis table
    tbl_type = _get_tbl_type(data=data)
    ibis_tbl = "ibis.expr.types.relations.Table" in str(type(data))

    # Only the Ibis code path is implemented; fail with a clear message rather than with an
    # unbound-variable error further down
    if not ibis_tbl:
        raise NotImplementedError(
            "`missing_vals_tbl()` currently only supports Ibis tables; "
            f"received a table of type '{tbl_type}'."
        )

    # The display table (and the zero-value cell mask) is built with Polars only
    df_lib_name_gt = _select_df_lib(preference="polars").__name__
    if df_lib_name_gt != "polars":
        raise NotImplementedError(
            "`missing_vals_tbl()` currently requires the Polars library to build its display table."
        )

    import polars as pl
    import polars.selectors as cs

    # Make a copy of the data to avoid modifying the original
    data = copy.deepcopy(data)

    # Get the row count and the column names from the table
    n_rows = data.count().to_polars()
    col_names = list(data.columns)

    # Split the rows into ten sectors: nine interior cut points plus the table start and end as
    # outer boundaries, so that the final sector absorbs any remainder rows and every row of the
    # table belongs to exactly one sector
    n_sectors = 10
    cut_points = _get_cut_points(n_rows=n_rows, n_cuts=n_sectors)
    boundaries = [0, *cut_points, n_rows]
    sector_bounds = list(zip(boundaries, boundaries[1:]))
    sector_labels = [str(i + 1) for i in range(len(sector_bounds))]

    # For each column, get the proportion of missing values in each sector; the null count is
    # taken over the whole sector (`start` up to, but not including, `stop`) so that it matches
    # the sector width used as the denominator. Empty sectors are reported as 0.0.
    # NOTE(review): sectors rely on the backend returning rows in a stable order for
    # offset/limit slices -- confirm for the backends in use
    missing_vals = {
        col: [
            (
                data[start:stop][col].isnull().sum().to_polars() / (stop - start)
                if stop > start
                else 0.0
            )
            for start, stop in sector_bounds
        ]
        for col in col_names
    }

    # Get a dictionary of counts of missing values in each column (`or 0` guards against a null
    # aggregate, e.g., from a table without rows)
    missing_val_counts = {col: data[col].isnull().sum().to_polars() or 0 for col in col_names}

    # Reshape to one entry per sector (plus the column names) for the display table
    missing_vals_wide = {
        "columns": col_names,
        **{
            label: [missing_vals[col][i] for col in col_names]
            for i, label in enumerate(sector_labels)
        },
    }

    # Create a Polars DataFrame with the missing value proportions
    missing_vals_df = pl.DataFrame(missing_vals_wide)

    # Get a count of total missing values
    n_missing_total = sum(missing_val_counts.values())

    # Build the table-type and table-dimensions HTML fragments used in the subtitle
    table_type_html = _create_table_type_html(tbl_type=tbl_type, tbl_name=None, font_size="10px")

    tbl_dims_html = _create_table_dims_html(columns=len(col_names), rows=n_rows, font_size="10px")

    check_mark = '<span style="color:#4CA64C;">✓</span>'

    # Compose the title HTML fragment
    if n_missing_total == 0:
        combined_title = f"Missing Values {check_mark}"
    else:
        combined_title = (
            "Missing Values <span style='font-size: 14px; "
            f"text-transform: uppercase; color: #333333'>{n_missing_total} in total</span>"
        )

    # Compose the subtitle HTML fragment
    combined_subtitle = (
        "<div>"
        '<div style="padding-top: 0; padding-bottom: 7px;">'
        f"{table_type_html}"
        f"{tbl_dims_html}"
        "</div>"
        "</div>"
    )

    # Build the display table; the sector cells carry no text (see `.fmt()`), only a fill color
    gt_tbl = (
        GT(missing_vals_df)
        .tab_header(title=html(combined_title), subtitle=html(combined_subtitle))
        .opt_table_font(font=google_font(name="IBM Plex Sans"))
        .opt_align_table_header(align="left")
        .cols_label(columns="Column")
        .cols_width(cases={"columns": "200px", **dict.fromkeys(sector_labels, "30px")})
        .cols_align(align="center", columns=sector_labels)
        .data_color(
            columns=sector_labels,
            palette=["#F5F5F5", "#000000"],
            domain=[0, 1],
        )
        .tab_style(
            style=style.borders(
                sides=["left", "right"], color="#F0F0F0", style="solid", weight="1px"
            ),
            locations=loc.body(columns=sector_labels),
        )
        .tab_style(
            style=style.css(
                "height: 20px; padding: 4px; white-space: nowrap; text-overflow: "
                "ellipsis; overflow: hidden;"
            ),
            locations=loc.body(),
        )
        .tab_style(
            style=style.text(color="black", font=google_font(name="IBM Plex Mono"), size="12px"),
            locations=loc.body(),
        )
        .tab_style(
            style=style.text(color="black", size="16px"),
            locations=loc.column_labels(),
        )
        .fmt(fns=lambda x: "", columns=sector_labels)
        .tab_style(style=style.fill(color="lightblue"), locations=loc.body(mask=cs.numeric().eq(0)))
    )

    return gt_tbl
| 982 | + |
| 983 | + |
| 984 | +def _get_cut_points(n_rows: int, n_cuts: int) -> list[int]: |
| 985 | + """ |
| 986 | + Get the cut points for a table preview. |
| 987 | +
|
| 988 | + For a certain number of rows and a certain number of cuts, get the cut points, which are integer |
| 989 | + values that divide the rows into equal parts (all parts don't have to be equal, but the cut |
| 990 | + points should be as close to equal as possible and add to the total number of rows). |
| 991 | + """ |
| 992 | + |
| 993 | + # Get the number of rows in each cut |
| 994 | + cut_size = n_rows // n_cuts |
| 995 | + |
| 996 | + # Get the cut points |
| 997 | + cut_points = [cut_size * i for i in range(1, n_cuts)] |
| 998 | + |
| 999 | + return cut_points |
| 1000 | + |
| 1001 | + |
775 | 1002 | def _get_column_names(data: FrameT | Any, ibis_tbl: bool, df_lib_name_gt: str) -> list[str]:
|
776 | 1003 | if ibis_tbl:
|
777 | 1004 | return data.columns if df_lib_name_gt == "polars" else list(data.columns)
|
|
0 commit comments