sdv-dev
diff --git a/‎.github/auto_assign.yml
+1-1 b/‎.github/auto_assign.yml
+1-1
diff --git a/‎HISTORY.md
+37-1 b/‎HISTORY.md
+37-1
diff --git a/‎latest_requirements.txt
+3-3 b/‎latest_requirements.txt
+3-3
diff --git a/‎pyproject.toml
+4-5 b/‎pyproject.toml
+4-5
diff --git a/‎sdv/__init__.py
+1-1 b/‎sdv/__init__.py
+1-1
diff --git a/‎sdv/_utils.py
+36 b/‎sdv/_utils.py
+36
diff --git a/‎sdv/constraints/tabular.py
+5-2 b/‎sdv/constraints/tabular.py
+5-2
diff --git a/‎sdv/constraints/utils.py
+17 b/‎sdv/constraints/utils.py
+17
diff --git a/‎sdv/data_processing/data_processor.py
+13-7 b/‎sdv/data_processing/data_processor.py
+13-7
diff --git a/‎sdv/data_processing/numerical_formatter.py
+11-37 b/‎sdv/data_processing/numerical_formatter.py
+11-37
diff --git a/‎sdv/evaluation/multi_table.py
+4-4 b/‎sdv/evaluation/multi_table.py
+4-4
diff --git a/‎sdv/logging/logger.py
+1-1 b/‎sdv/logging/logger.py
+1-1
@@ -1,2 +1,2 @@
 # Set to true to add assignees to pull requests
-addAssignees: true
+addAssignees: author
@@ -1,6 +1,42 @@
 # Release Notes
 
-### v1.15.0 - 2024-07-11
+## v1.16.0 - 2024-08-22
+
+This release enables the `HMASynthesizer` and other utility functions to work with null foreign key values! It also adds an `anonymization` method to the metadata classes. Additionally, it patches a bug so that SDV works with more Pandas data types.
+
+### New Features
+
+* Add metadata anonymization to public SDV - Issue [#2137](https://github.com/sdv-dev/SDV/issues/2137) by @R-Palazzo
+* Switch drop_missing_values in drop_unknown_references to support null foreign keys by default - Issue [#2076](https://github.com/sdv-dev/SDV/issues/2076) by @R-Palazzo
+* Support nullable foreign keys in HMA - Issue [#2063](https://github.com/sdv-dev/SDV/issues/2063) by @rwedge
+* Remove input error from base synthesizer class once nullable foreign keys are supported - Issue [#2057](https://github.com/sdv-dev/SDV/issues/2057) by @rwedge
+* Support null foreign keys in get_random_subset - Issue [#2056](https://github.com/sdv-dev/SDV/issues/2056) by @R-Palazzo
+* Warn the user if they are trying to save an unfit synthesizer - Issue [#1961](https://github.com/sdv-dev/SDV/issues/1961) by @fealho
+
+### Bugs Fixed
+
+* Using FixedCombinations constraint with an integer constraint column causes sampling to fail - Issue [#2183](https://github.com/sdv-dev/SDV/issues/2183) by @R-Palazzo
+* Metadata Detection Fails with new Data Type - Issue [#2182](https://github.com/sdv-dev/SDV/issues/2182) by @R-Palazzo
+* Unable to visualize just the real data (or just the synthetic data) in a multi-table setting - Issue [#2160](https://github.com/sdv-dev/SDV/issues/2160) by @R-Palazzo
+* [dtypes] Numerical Formatter Fails to Learn Format of New Data Types - Issue [#2156](https://github.com/sdv-dev/SDV/issues/2156) by @R-Palazzo
+* Primary keys may not be unique for variable length regexes - Issue [#2116](https://github.com/sdv-dev/SDV/issues/2116) by @amontanez24
+* Confusing warning when using GANs that suggests that CUDA isn't being used - Issue [#2052](https://github.com/sdv-dev/SDV/issues/2052) by @fealho
+* PAR DiagnosticReport not 1.0 with float categorical columns - Issue [#1910](https://github.com/sdv-dev/SDV/issues/1910) by @lajohn4747
+* In `PARSynthesizer` I cannot pass in datetime context (`InvalidDataError` during fitting) - Issue [#1485](https://github.com/sdv-dev/SDV/issues/1485) by @lajohn4747
+
+### Internal
+
+* Enabling sdv logging causes tests to fail locally - Issue [#2162](https://github.com/sdv-dev/SDV/issues/2162) by @amontanez24
+* Separate primary key detection functionality - Issue [#2101](https://github.com/sdv-dev/SDV/issues/2101) by @amontanez24
+
+### Maintenance
+
+* [dtypes] Update the NumericalFormatter to use the `learn_rounding_digits` from RDT - Issue [#2164](https://github.com/sdv-dev/SDV/issues/2164) by @R-Palazzo
+* Mock every usage of `is_faker_function` to speed up the unit tests - Issue [#2163](https://github.com/sdv-dev/SDV/issues/2163) by @R-Palazzo
+* Review docs-related dev dependencies - Issue [#2148](https://github.com/sdv-dev/SDV/issues/2148) by @rwedge
+* Cap boto and botocore - Issue [#2123](https://github.com/sdv-dev/SDV/issues/2123) by @lajohn4747
+
+## v1.15.0 - 2024-07-11
 
 This release adds a new utils function called `get_random_sequence_subset`, that allows users to get a subset of sequential data.
 
 
@@ -6,6 +6,6 @@ graphviz==0.20.3
 numpy==1.26.4
 pandas==2.2.2
 platformdirs==4.2.2
-rdt==1.12.1
-sdmetrics==0.14.1
-tqdm==4.66.4
+rdt==1.12.3
+sdmetrics==0.15.1
+tqdm==4.66.5
@@ -21,8 +21,8 @@ license = { text = 'BSL-1.1' }
 requires-python = '>=3.8,<3.13'
 readme = 'README.md'
 dependencies = [
-    'boto3>=1.28',
-    'botocore>=1.31',
+    'boto3>=1.28,<2.0.0',
+    'botocore>=1.31,<2.0.0',
     'cloudpickle>=2.1.0',
     'graphviz>=0.13.2',
     "numpy>=1.21.0,<2.0.0;python_version<'3.10'",
@@ -35,7 +35,7 @@ dependencies = [
     'copulas>=0.11.0',
     'ctgan>=0.10.0',
     'deepecho>=0.6.0',
-    'rdt>=1.12.0',
+    'rdt>=1.12.3',
     'sdmetrics>=0.14.0',
     'platformdirs>=4.0',
     'pyyaml>=6.0.1',
@@ -75,7 +75,6 @@ dev = [
 
     # docs
     'docutils>=0.12,<1',
-    'm2r2>=0.2.5,<1',
     'nbsphinx>=0.5.0,<1',
     'sphinx_toolbox>=2.5,<4',
     'Sphinx>=3,<8',
@@ -133,7 +132,7 @@ namespaces = false
 version = {attr = 'sdv.__version__'}
 
 [tool.bumpversion]
-current_version = "1.15.0"
+current_version = "1.16.0.dev1"
 parse = '(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
 serialize = [
     '{major}.{minor}.{patch}.{release}{candidate}',
 
@@ -6,7 +6,7 @@
 
 __author__ = 'DataCebo, Inc.'
 __email__ = '[email protected]'
-__version__ = '1.15.0'
+__version__ = '1.16.0.dev1'
 
 
 import sys
 
@@ -10,10 +10,16 @@
 
 import pandas as pd
 from pandas.core.tools.datetimes import _guess_datetime_format_for_array
+from rdt.transformers.utils import _GENERATORS
 
 from sdv import version
 from sdv.errors import SDVVersionWarning, SynthesizerInputError, VersionError
 
+try:
+    from re import _parser as sre_parse
+except ImportError:
+    import sre_parse
+
 
 def _cast_to_iterable(value):
     """Return a ``list`` if the input object is not a ``list`` or ``tuple``."""
@@ -403,3 +409,33 @@ def generate_synthesizer_id(synthesizer):
     synth_version = version.public
     unique_id = ''.join(str(uuid.uuid4()).split('-'))
     return f'{class_name}_{synth_version}_{unique_id}'
+
+
+def _get_chars_for_option(option, params):
+    if option not in _GENERATORS:
+        raise ValueError(f'REGEX operation: {option} is not supported by SDV.')
+
+    if option == sre_parse.MAX_REPEAT:
+        new_option, new_params = params[2][0]  # The value at the second index is the nested option
+        return _get_chars_for_option(new_option, new_params)
+
+    return list(_GENERATORS[option](params, 1)[0])
+
+
+def get_possible_chars(regex, num_subpatterns=None):
+    """Get the list of possible characters a regex can create.
+
+    Args:
+        regex (str):
+            The regex to parse.
+        num_subpatterns (int):
+            The number of sub-patterns from the regex to find characters for.
+    """
+    parsed = sre_parse.parse(regex)
+    parsed = [p for p in parsed if p[0] != sre_parse.AT]
+    num_subpatterns = num_subpatterns or len(parsed)
+    possible_chars = []
+    for option, params in parsed[:num_subpatterns]:
+        possible_chars += _get_chars_for_option(option, params)
+
+    return possible_chars
@@ -48,6 +48,7 @@
     cast_to_datetime64,
     compute_nans_column,
     get_datetime_diff,
+    get_mappable_combination,
     logit,
     matches_datetime_format,
     revert_nans_columns,
@@ -297,9 +298,10 @@ def _fit(self, table_data):
         self._combinations_to_uuids = {}
         self._uuids_to_combinations = {}
         for combination in self._combinations.itertuples(index=False, name=None):
+            mappable_combination = get_mappable_combination(combination)
             uuid_str = str(uuid.uuid4())
-            self._combinations_to_uuids[combination] = uuid_str
-            self._uuids_to_combinations[uuid_str] = combination
+            self._combinations_to_uuids[mappable_combination] = uuid_str
+            self._uuids_to_combinations[uuid_str] = mappable_combination
 
     def is_valid(self, table_data):
         """Say whether the column values are within the original combinations.
@@ -333,6 +335,7 @@ def _transform(self, table_data):
             pandas.DataFrame:
                 Transformed data.
         """
+        table_data[self._columns] = table_data[self._columns].replace({np.nan: None})
         combinations = table_data[self._columns].itertuples(index=False, name=None)
         uuids = map(self._combinations_to_uuids.get, combinations)
         table_data[self._joint_column] = list(uuids)
 
@@ -204,3 +204,20 @@ def get_datetime_diff(high, low, high_datetime_format=None, low_datetime_format=
     diff_column = diff_column.astype(np.float64)
     diff_column[nan_mask] = np.nan
     return diff_column
+
+
+def get_mappable_combination(combination):
+    """Get a mappable combination of values.
+
+    This function replaces NaN values with None inside the tuple
+    to ensure consistent comparisons when using mapping.
+
+    Args:
+        combination (tuple):
+            A combination of values.
+
+    Returns:
+        tuple:
+            A mappable combination of values.
+    """
+    return tuple(None if pd.isna(x) else x for x in combination)
@@ -9,6 +9,7 @@
 import pandas as pd
 import rdt
 from pandas.api.types import is_float_dtype, is_integer_dtype
+from pandas.errors import IntCastingNaNError
 from rdt.transformers import AnonymizedFaker, get_default_transformers
 from rdt.transformers.pii.anonymization import get_anonymized_transformer
 
@@ -902,18 +903,23 @@ def reverse_transform(self, data, reset_keys=False):
             reversed_data[column_name] = column_data[column_data.notna()]
             try:
                 reversed_data[column_name] = reversed_data[column_name].astype(dtype)
-            except ValueError as e:
+            except (IntCastingNaNError, ValueError) as e:
+                message = (
+                    f"The real data in '{column_name}' was stored as '{dtype}' but the "
+                    'synthetic data could not be cast back to this type. If this is a '
+                    'problem, please check your input data and metadata settings.'
+                )
+                if isinstance(e, IntCastingNaNError):
+                    LOGGER.debug(message)
+                    continue
+
+                # Handle the ValueError case
                 column_metadata = self.metadata.columns.get(column_name)
                 sdtype = column_metadata.get('sdtype')
                 if sdtype not in self._DTYPE_TO_SDTYPE.values():
-                    LOGGER.info(
-                        f"The real data in '{column_name}' was stored as '{dtype}' but the "
-                        'synthetic data could not be cast back to this type. If this is a '
-                        'problem, please check your input data and metadata settings.'
-                    )
+                    LOGGER.info(message)
                     if column_name in self.formatters:
                         self.formatters.pop(column_name)
-
                 else:
                     raise ValueError(e)
 
 
@@ -3,8 +3,8 @@
 import logging
 import sys
 
-import numpy as np
 import pandas as pd
+from rdt.transformers.utils import learn_rounding_digits
 
 LOGGER = logging.getLogger(__name__)
 
@@ -51,34 +51,6 @@ def __init__(
         self.enforce_min_max_values = enforce_min_max_values
         self.computer_representation = computer_representation
 
-    @staticmethod
-    def _learn_rounding_digits(data):
-        """Check if data has any decimals."""
-        name = data.name
-        data = np.array(data)
-        roundable_data = data[~(np.isinf(data) | pd.isna(data))]
-
-        # Doesn't contain numbers
-        if len(roundable_data) == 0:
-            return None
-
-        # Doesn't contain decimal digits
-        if ((roundable_data % 1) == 0).all():
-            return 0
-
-        # Try to round to fewer digits
-        if (roundable_data == roundable_data.round(MAX_DECIMALS)).all():
-            for decimal in range(MAX_DECIMALS + 1):
-                if (roundable_data == roundable_data.round(decimal)).all():
-                    return decimal
-
-        # Can't round, not equal after MAX_DECIMALS digits of precision
-        LOGGER.info(
-            f"No rounding scheme detected for column '{name}'."
-            ' Synthetic data will not be rounded.'
-        )
-        return None
-
     def learn_format(self, column):
         """Learn the format of a column.
 
@@ -92,7 +64,7 @@ def learn_format(self, column):
             self._max_value = column.max()
 
         if self.enforce_rounding:
-            self._rounding_digits = self._learn_rounding_digits(column)
+            self._rounding_digits = learn_rounding_digits(column)
 
     def format_data(self, column):
         """Format a column according to the learned format.
@@ -105,20 +77,22 @@ def format_data(self, column):
             numpy.ndarray:
                 containing the formatted data.
         """
-        column = column.copy().to_numpy()
+        column = column.copy()
         if self.enforce_min_max_values:
             column = column.clip(self._min_value, self._max_value)
-        elif self.computer_representation != 'Float':
+        elif not self.computer_representation.startswith('Float'):
             min_bound, max_bound = INTEGER_BOUNDS[self.computer_representation]
             column = column.clip(min_bound, max_bound)
 
-        is_integer = np.dtype(self._dtype).kind == 'i'
+        is_integer = pd.api.types.is_integer_dtype(self._dtype)
+        np_integer_with_nans = (
+            not pd.api.types.is_extension_array_dtype(self._dtype)
+            and is_integer
+            and pd.isna(column).any()
+        )
         if self.enforce_rounding and self._rounding_digits is not None:
             column = column.round(self._rounding_digits)
         elif is_integer:
             column = column.round(0)
 
-        if pd.isna(column).any() and is_integer:
-            return column
-
-        return column.astype(self._dtype)
+        return column.astype(self._dtype if not np_integer_with_nans else 'float64')
@@ -77,8 +77,8 @@ def get_column_plot(real_data, synthetic_data, metadata, table_name, column_name
             1D marginal distribution plot (i.e. a histogram) of the columns.
     """
     metadata = metadata.tables[table_name]
-    real_data = real_data[table_name]
-    synthetic_data = synthetic_data[table_name]
+    real_data = real_data[table_name] if real_data else None
+    synthetic_data = synthetic_data[table_name] if synthetic_data else None
     return single_table_visualization.get_column_plot(
         real_data,
         synthetic_data,
@@ -118,8 +118,8 @@ def get_column_pair_plot(
             2D bivariate distribution plot (i.e. a scatterplot) of the columns.
     """
     metadata = metadata.tables[table_name]
-    real_data = real_data[table_name]
-    synthetic_data = synthetic_data[table_name]
+    real_data = real_data[table_name] if real_data else None
+    synthetic_data = synthetic_data[table_name] if synthetic_data else None
     return single_table_visualization.get_column_pair_plot(
         real_data, synthetic_data, metadata, column_names, sample_size, plot_type
     )
 
@@ -33,7 +33,7 @@ def __init__(self, filename=None):
 
     def format(self, record):  # noqa: A003
         """Format the record and write to CSV."""
-        row = record.msg
+        row = record.msg.copy()
         row['LEVEL'] = record.levelname
         self.writer.writerow(row)
         data = self.output.getvalue()
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`# Set to true to add assignees to pull requests`
`2`		`-addAssignees: true`
	`2`	`+addAssignees: author`