|
5 | 5 |
|
6 | 6 | from sdv.cag import FixedCombinations
|
7 | 7 | from sdv.metadata import Metadata
|
| 8 | +from sdv.single_table.copulas import GaussianCopulaSynthesizer |
8 | 9 | from tests.utils import run_pattern
|
9 | 10 |
|
10 | 11 |
|
@@ -117,3 +118,221 @@ def test_fixed_null_combinations_with_multi_table():
|
117 | 118 | assert set(data.keys()) == set(reverse_transformed.keys())
|
118 | 119 | for table_name, table in data.items():
|
119 | 120 | pd.testing.assert_frame_equal(table, reverse_transformed[table_name])
|
| 121 | + |
| 122 | + |
def test_fixed_combinations_multiple_patterns():
    """Check two ``FixedCombinations`` patterns over disjoint column pairs.

    The synthesizer is fit with patterns on ``(A, B)`` and ``(C, D)``. The modified
    metadata must consist of the combined ``A#B`` and ``C#D`` columns, the original
    metadata must be unchanged, and the sampled value pairs must equal the real ones.
    """
    # Setup
    data = pd.DataFrame({
        'A': [1, 2, 3, 1, 2, 1],
        'B': [10, 20, 30, 10, 20, 10],
        'C': [100, 200, 300, 100, 200, 100],
        'D': [1000, 2000, 3000, 1000, 2000, 1000],
    })
    metadata = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in data.columns}
    })
    constrained_pairs = [('A', 'B'), ('C', 'D')]
    patterns = [FixedCombinations(list(pair)) for pair in constrained_pairs]

    # Run
    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.add_cag(patterns=patterns)
    synthesizer.fit(data)
    samples = synthesizer.sample(100)
    modified = synthesizer.get_metadata('modified')
    original = synthesizer.get_metadata('original')

    # Assert
    expected_modified = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in ('A#B', 'C#D')}
    })
    assert expected_modified.to_dict() == modified.to_dict()
    assert original.to_dict() == metadata.to_dict()

    # Each constrained pair must show exactly the combinations present in the real data.
    for left, right in constrained_pairs:
        real_combos = set(zip(data[left], data[right]))
        sampled_combos = set(zip(samples[left], samples[right]))
        assert real_combos == sampled_combos
| 173 | + |
| 174 | + |
def test_fixed_combinations_multiple_patterns_reject_sampling():
    """Check two ``FixedCombinations`` patterns that share column ``A``.

    The patterns cover ``(A, B)`` and ``(A, C)``. The expected modified metadata holds
    the combined ``A#B`` column next to a plain ``C`` column, and the sampled value
    pairs for both patterns must still equal the real ones.
    """
    # Setup
    data = pd.DataFrame({
        'A': [1, 2, 3, 1, 2, 1],
        'B': [10, 20, 30, 10, 20, 10],
        'C': [100, 200, 300, 100, 200, 100],
    })
    metadata = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in data.columns}
    })
    constrained_pairs = [('A', 'B'), ('A', 'C')]
    patterns = [FixedCombinations(list(pair)) for pair in constrained_pairs]

    # Run
    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.add_cag(patterns=patterns)
    synthesizer.fit(data)
    samples = synthesizer.sample(100)
    modified = synthesizer.get_metadata('modified')
    original = synthesizer.get_metadata('original')

    # Assert
    expected_modified = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in ('A#B', 'C')}
    })
    assert expected_modified.to_dict() == modified.to_dict()
    assert original.to_dict() == metadata.to_dict()

    # Each constrained pair must show exactly the combinations present in the real data.
    for left, right in constrained_pairs:
        real_combos = set(zip(data[left], data[right]))
        sampled_combos = set(zip(samples[left], samples[right]))
        assert real_combos == sampled_combos
| 223 | + |
| 224 | + |
def test_fixed_combinations_multiple_patterns_three_patterns():
    """Check three ``FixedCombinations`` patterns where the last one overlaps the others.

    The patterns are added in the order ``(A, B)``, ``(C, D)``, ``(A, C)``. The expected
    modified metadata holds only the combined ``A#B`` and ``C#D`` columns, and the
    sampled value pairs for all three patterns must equal the real ones.
    """
    # Setup
    data = pd.DataFrame({
        'A': [1, 2, 3, 1, 2, 1],
        'B': [10, 20, 30, 10, 20, 10],
        'C': [100, 200, 300, 100, 200, 100],
        'D': [1000, 2000, 3000, 1000, 2000, 1000],
    })
    metadata = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in data.columns}
    })
    constrained_pairs = [('A', 'B'), ('C', 'D'), ('A', 'C')]
    patterns = [FixedCombinations(list(pair)) for pair in constrained_pairs]

    # Run
    synthesizer = GaussianCopulaSynthesizer(metadata)
    synthesizer.add_cag(patterns=patterns)
    synthesizer.fit(data)
    samples = synthesizer.sample(100)
    modified = synthesizer.get_metadata('modified')
    original = synthesizer.get_metadata('original')

    # Assert
    expected_modified = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in ('A#B', 'C#D')}
    })
    assert expected_modified.to_dict() == modified.to_dict()
    assert original.to_dict() == metadata.to_dict()

    # Each constrained pair must show exactly the combinations present in the real data.
    for left, right in constrained_pairs:
        real_combos = set(zip(data[left], data[right]))
        sampled_combos = set(zip(samples[left], samples[right]))
        assert real_combos == sampled_combos
| 279 | + |
| 280 | + |
def test_fixed_combinations_multiple_patterns_three_patterns_reject_sampling():
    """Check three ``FixedCombinations`` patterns with the overlapping one in the middle.

    Test that when the second pattern in the chain fails, the third pattern still works.
    The patterns are added in the order ``(A, B)``, ``(A, C)``, ``(C, D)``; the expected
    modified metadata holds only the combined ``A#B`` and ``C#D`` columns.
    """
    # Setup
    data = pd.DataFrame({
        'A': [1, 2, 3, 1, 2, 1],
        'B': [10, 20, 30, 10, 20, 10],
        'C': [100, 200, 300, 100, 200, 100],
        'D': [1000, 2000, 3000, 1000, 2000, 1000],
    })
    metadata = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in data.columns}
    })
    ab_pattern = FixedCombinations(['A', 'B'])
    cd_pattern = FixedCombinations(['C', 'D'])
    ac_pattern = FixedCombinations(['A', 'C'])

    # Run
    synthesizer = GaussianCopulaSynthesizer(metadata)
    # The overlapping (A, C) pattern is deliberately placed second in the chain.
    synthesizer.add_cag(patterns=[ab_pattern, ac_pattern, cd_pattern])
    synthesizer.fit(data)
    samples = synthesizer.sample(100)
    modified = synthesizer.get_metadata('modified')
    original = synthesizer.get_metadata('original')

    # Assert
    expected_modified = Metadata.load_from_dict({
        'columns': {name: {'sdtype': 'categorical'} for name in ('A#B', 'C#D')}
    })
    assert expected_modified.to_dict() == modified.to_dict()
    assert original.to_dict() == metadata.to_dict()

    # Each constrained pair must show exactly the combinations present in the real data.
    for left, right in (('A', 'B'), ('C', 'D'), ('A', 'C')):
        real_combos = set(zip(data[left], data[right]))
        sampled_combos = set(zip(samples[left], samples[right]))
        assert real_combos == sampled_combos
0 commit comments