Skip to content

Commit fceac08

Browse files
authored
Ensure HMA diagnostic report is 1.0 for integer primary/foreign key with very large values (#2314)
1 parent 47bc8d4 commit fceac08

File tree

7 files changed

+503
-7
lines changed

7 files changed

+503
-7
lines changed

sdv/data_processing/data_processor.py

+10
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,7 @@ def __init__(
138138
self.fitted = False
139139
self.formatters = {}
140140
self._primary_key = self.metadata.primary_key
141+
self._warned_overflow = False
141142
self._prepared_for_fitting = False
142143
self._keys = deepcopy(self.metadata.alternate_keys)
143144
if self._primary_key:
@@ -934,6 +935,15 @@ def reverse_transform(self, data, reset_keys=False):
934935
self.formatters.pop(column_name)
935936
else:
936937
raise ValueError(e)
938+
except OverflowError:
939+
if not self._warned_overflow:
940+
warnings.warn(
941+
f"The real data in '{self.table_name}' and column '{column_name}' was "
942+
f"stored as '{dtype}' but the synthetic data overflowed when casting back "
943+
'to this type. If this is a problem, please check your input data '
944+
'and metadata settings.'
945+
)
946+
self._warned_overflow = True
937947

938948
# reformat columns using the formatters
939949
for column in sampled_columns:

sdv/sampling/hierarchical_sampler.py

+4-2
Original file line number | Diff line number | Diff line change
@@ -207,7 +207,7 @@ def _sample_children(self, table_name, sampled_data, scale=1.0):
207207
self._enforce_table_size(child_name, table_name, scale, sampled_data)
208208

209209
if child_name not in sampled_data: # Sample based on only 1 parent
210-
for _, row in sampled_data[table_name].iterrows():
210+
for _, row in sampled_data[table_name].astype(object).iterrows():
211211
self._add_child_rows(
212212
child_name=child_name,
213213
parent_name=table_name,
@@ -219,7 +219,9 @@ def _sample_children(self, table_name, sampled_data, scale=1.0):
219219

220220
if child_name not in sampled_data: # No child rows sampled, force row creation
221221
num_rows_key = f'__{child_name}__{foreign_key}__num_rows'
222-
max_num_child_index = sampled_data[table_name][num_rows_key].idxmax()
222+
max_num_child_index = pd.to_numeric(
223+
sampled_data[table_name][num_rows_key], errors='coerce'
224+
).idxmax()
223225
parent_row = sampled_data[table_name].iloc[max_num_child_index]
224226

225227
self._add_child_rows(

sdv/sampling/independent_sampler.py

+7
Original file line number | Diff line number | Diff line change
@@ -118,6 +118,13 @@ def _finalize(self, sampled_data):
118118

119119
else:
120120
raise ValueError(e)
121+
except OverflowError:
122+
LOGGER.debug(
123+
f"The real data in '{table_name}' and column '{name}' was stored as "
124+
f"'{dtype}' but the synthetic data overflowed when casting back to "
125+
'this type. If this is a problem, please check your input data '
126+
'and metadata settings.'
127+
)
121128

122129
final_data[table_name] = table_rows[list(dtypes.keys())]
123130

0 commit comments

Comments
 (0)