Skip to content

Commit a065ea7

Browse files
authored
Merge pull request #161 from scikit-learn-contrib/pklm
PKLM Implementation
2 parents 4b3de91 + 14e1490 commit a065ea7

File tree

12 files changed

+1303
-61
lines changed

12 files changed

+1303
-61
lines changed

docs/analysis.rst

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ The analysis module provides tools to characterize the type of holes.
1111

1212
The MNAR case is the trickiest, the user must first consider whether their missing data mechanism is MNAR. In the meantime, we assume that the missing-data mechanism is ignorable (i.e., it is not MNAR). If an MNAR mechanism is suspected, please see this article :ref:`An approach to test for MNAR [1]<Noonan-article>` for relevant actions.
1313

14-
Then Qolmat proposes a test to determine whether the missing data mechanism is MCAR or MAR.
14+
Then Qolmat proposes two tests to determine whether the missing data mechanism is MCAR or MAR.
1515

1616
2. How to use the results
1717
-------------------------
@@ -45,12 +45,16 @@ The MCAR missing-data mechanism means that there is independence between the pre
4545
a. Little's Test
4646
^^^^^^^^^^^^^^^^
4747

48-
The best-known MCAR test is the :ref:`Little [2]<Little-article>` test, and it has been implemented in :class:`LittleTest`. Keep in mind that the Little's test is designed to test the homogeneity of means across the missing patterns and won't be efficient to detect the heterogeneity of covariance across missing patterns.
48+
The best-known MCAR test is the :ref:`Little [1]<Little-article>` test, and it has been implemented in :class:`LittleTest`. Keep in mind that the Little's test is designed to test the homogeneity of means across the missing patterns and won't be efficient to detect the heterogeneity of covariance across missing patterns.
4949

5050
b. PKLM Test
5151
^^^^^^^^^^^^
5252

53-
The :ref:`PKLM [2]<PKLM-article>` (Projected Kullback-Leibler MCAR) test compares the distributions of different missing patterns on random projections in the variable space of the data. This recent test applies to mixed-type data. It is not implemented yet in Qolmat.
53+
The :ref:`PKLM [2]<PKLM-article>` (Projected Kullback-Leibler MCAR) test compares the distributions of different missing patterns on random projections in the variable space of the data. This recent test applies to mixed-type data. The :class:`PKLMTest` is now implemented in Qolmat.
54+
To carry out this test, we perform random projections in the variable space of the data. These random projections allow us to construct a fully observed sub-matrix and an associated number of missing patterns.
55+
The idea is then to compare the distributions of the missing patterns through the Kullback-Leibler distance.
56+
To do this, the distributions for each pattern are estimated using Random Forests.
57+
5458

5559
References
5660
----------

docs/images/schema_qolmat.png

5.35 KB
Loading

examples/tutorials/plot_tuto_benchmark_TS.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,9 @@
7878
ratio_masked = 0.1
7979

8080
imputer_median = imputers.ImputerSimple(groups=("station",), strategy="median")
81-
imputer_interpol = imputers.ImputerInterpolation(groups=("station",), method="linear")
81+
imputer_interpol = imputers.ImputerInterpolation(
82+
groups=("station",), method="linear"
83+
)
8284
imputer_residuals = imputers.ImputerResiduals(
8385
groups=("station",),
8486
period=365,
@@ -103,7 +105,10 @@
103105
)
104106

105107
generator_holes = missing_patterns.EmpiricalHoleGenerator(
106-
n_splits=4, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked
108+
n_splits=4,
109+
groups=("station",),
110+
subset=cols_to_impute,
111+
ratio_masked=ratio_masked,
107112
)
108113

109114
dict_imputers = {
@@ -142,11 +147,17 @@
142147
# Aotizhongxin
143148

144149
df_plot = df[cols_to_impute]
145-
dfs_imputed = {name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()}
150+
dfs_imputed = {
151+
name: imp.fit_transform(df_plot) for name, imp in dict_imputers.items()
152+
}
146153
station = "Aotizhongxin"
147154
df_station = df_plot.loc[station]
148-
dfs_imputed_station = {name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()}
149-
fig, axs = plt.subplots(3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute)))
155+
dfs_imputed_station = {
156+
name: df_plot.loc[station] for name, df_plot in dfs_imputed.items()
157+
}
158+
fig, axs = plt.subplots(
159+
3, 1, sharex=True, figsize=(10, 3 * len(cols_to_impute))
160+
)
150161
for col, ax in zip(cols_to_impute, axs.flatten()):
151162
values_orig = df_station[col]
152163
ax.plot(values_orig, ".", color="black", label="original")
@@ -174,7 +185,9 @@
174185
fig = plt.figure(figsize=(10, 10))
175186
i_plot = 1
176187
for i, col in enumerate(cols_to_impute[:-1]):
177-
for i_imputer, (name_imputer, df_imp) in enumerate(dfs_imputed_station.items()):
188+
for i_imputer, (name_imputer, df_imp) in enumerate(
189+
dfs_imputed_station.items()
190+
):
178191
ax = fig.add_subplot(n_columns, n_imputers, i_plot)
179192
plot.compare_covariances(
180193
df_station,

examples/tutorials/plot_tuto_diffusion_models.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,11 @@
6666
df_data_valid = df_data.iloc[:500]
6767

6868
tabddpm = ImputerDiffusion(
69-
model=TabDDPM(), epochs=10, batch_size=100, x_valid=df_data_valid, print_valid=True
69+
model=TabDDPM(),
70+
epochs=10,
71+
batch_size=100,
72+
x_valid=df_data_valid,
73+
print_valid=True,
7074
)
7175
tabddpm = tabddpm.fit(df_data)
7276

@@ -150,8 +154,12 @@
150154
# reconstruction errors (mae) but increases distribution distance (KL_columnwise).
151155

152156
dict_imputers = {
153-
"num_sampling=5": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
154-
"num_sampling=10": ImputerDiffusion(model=TabDDPM(num_sampling=10), epochs=10, batch_size=100),
157+
"num_sampling=5": ImputerDiffusion(
158+
model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
159+
),
160+
"num_sampling=10": ImputerDiffusion(
161+
model=TabDDPM(num_sampling=10), epochs=10, batch_size=100
162+
),
155163
}
156164

157165
comparison = comparator.Comparator(
@@ -196,7 +204,9 @@
196204
# but requires a longer training/inference time.
197205

198206
dict_imputers = {
199-
"tabddpm": ImputerDiffusion(model=TabDDPM(num_sampling=5), epochs=10, batch_size=100),
207+
"tabddpm": ImputerDiffusion(
208+
model=TabDDPM(num_sampling=5), epochs=10, batch_size=100
209+
),
200210
"tsddpm": ImputerDiffusion(
201211
model=TsDDPM(num_sampling=5, is_rolling=False),
202212
epochs=10,

examples/tutorials/plot_tuto_hole_generator.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
It consists in hourly air pollutants data from 12 chinese nationally-controlled
1515
air-quality monitoring sites.
1616
"""
17+
1718
from typing import List
1819

1920
import matplotlib
@@ -49,7 +50,9 @@
4950
# Missing values are in white, while observed ones are in black.
5051

5152
plt.figure(figsize=(15, 4))
52-
plt.imshow(df.notna().values.T, aspect="auto", cmap="binary", interpolation="none")
53+
plt.imshow(
54+
df.notna().values.T, aspect="auto", cmap="binary", interpolation="none"
55+
)
5356
plt.yticks(range(len(df.columns)), df.columns)
5457
plt.xlabel("Samples", fontsize=12)
5558
plt.grid(False)
@@ -96,7 +99,9 @@ def visualise_missing_values(df_init: pd.DataFrame, df_mask: pd.DataFrame):
9699
colorsList = [(0.9, 0, 0), (0, 0, 0), (0.8, 0.8, 0.8)]
97100
custom_cmap = matplotlib.colors.ListedColormap(colorsList)
98101
plt.figure(figsize=(15, 4))
99-
plt.imshow(df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none")
102+
plt.imshow(
103+
df_tot.values.T, aspect="auto", cmap=custom_cmap, interpolation="none"
104+
)
100105
plt.yticks(range(len(df_tot.columns)), df_tot.columns)
101106
plt.xlabel("Samples", fontsize=12)
102107
plt.grid(False)
@@ -156,7 +161,9 @@ def plot_cdf(
156161
_, axs = plt.subplots(1, df.shape[1], sharey=True, figsize=(15, 3))
157162

158163
hole_sizes_original = get_holes_sizes_column_wise(df.to_numpy())
159-
for ind, (hole_original, col) in enumerate(zip(hole_sizes_original, df.columns)):
164+
for ind, (hole_original, col) in enumerate(
165+
zip(hole_sizes_original, df.columns)
166+
):
160167
sorted_data = np.sort(hole_original)
161168
cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
162169
axs[ind].plot(sorted_data, cdf, c="gray", lw=2, label="original")
@@ -166,7 +173,9 @@ def plot_cdf(
166173
array_mask[array_mask == True] = np.nan
167174
hole_sizes_created = get_holes_sizes_column_wise(array_mask.to_numpy())
168175

169-
for ind, (hole_created, col) in enumerate(zip(hole_sizes_created, df.columns)):
176+
for ind, (hole_created, col) in enumerate(
177+
zip(hole_sizes_created, df.columns)
178+
):
170179
sorted_data = np.sort(hole_created)
171180
cdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data)
172181
axs[ind].plot(sorted_data, cdf, c=color, lw=2, label=label)
@@ -309,7 +318,13 @@ def plot_cdf(
309318

310319
plot_cdf(
311320
df,
312-
[uniform_mask, geometric_mask, empirical_mask, multi_markov_mask, grouped_mask],
321+
[
322+
uniform_mask,
323+
geometric_mask,
324+
empirical_mask,
325+
multi_markov_mask,
326+
grouped_mask,
327+
],
313328
["uniform", "geometric", "empirical", "mutli markov", "grouped"],
314329
["tab:orange", "tab:blue", "tab:green", "tab:pink", "tab:olive"],
315330
)

0 commit comments

Comments
 (0)