
Commit 08bc72b

Fix tests
Signed-off-by: Tushar Goel <[email protected]>
1 parent d345e52 commit 08bc72b

5 files changed (+116, -33 lines)


vulnerabilities/models.py

Lines changed: 2 additions & 1 deletion
@@ -1360,7 +1360,8 @@ class Meta:
 
     def save(self, *args, **kwargs):
         advisory_data = self.to_advisory_data()
-        self.unique_content_id = compute_content_id(advisory_data, include_metadata=False)
+        if not self.unique_content_id:
+            self.unique_content_id = compute_content_id(advisory_data, include_metadata=False)
         super().save(*args, **kwargs)
 
     def to_advisory_data(self) -> "AdvisoryData":
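The guard added here makes content IDs sticky: computed on first save, preserved afterwards. A minimal sketch of the resulting behavior, using only the model and helper named in the hunk (field values are made up):

    # Sketch: unique_content_id is computed once, then kept as-is.
    advisory = Advisory(summary="example", unique_content_id="")
    advisory.save()  # empty ID -> filled in via compute_content_id()
    first_id = advisory.unique_content_id

    advisory.summary = "example, edited"
    advisory.save()  # non-empty ID -> unchanged, even though content changed
    assert advisory.unique_content_id == first_id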

vulnerabilities/pipelines/recompute_content_ids.py

Lines changed: 3 additions & 2 deletions
@@ -92,14 +92,14 @@ def process_advisories(
         for advisory_ids in progress.iter(advisory_batches):
             progress.log_progress()
             logger.debug(f"{advisory_func.__name__} len={len(advisory_ids)}")
-            advisory_func(advisory_ids=advisory_ids, logger=None)
+            advisory_func(advisory_ids=advisory_ids, logger=progress_logger)
         return
 
     logger.info(f"Starting ProcessPoolExecutor with {max_workers} max_workers")
 
     with futures.ProcessPoolExecutor(max_workers) as executor:
         future_to_advisories = {
-            executor.submit(advisory_func, advisory_ids, None): advisory_ids
+            executor.submit(advisory_func, advisory_ids, progress_logger): advisory_ids
             for advisory_ids in advisory_batches
         }
 

@@ -187,6 +187,7 @@ def recompute_content_ids(self):
         """
         while True:
             advisories = Advisory.objects.exclude(unique_content_id__length=64)
+            print(f"advisories: {advisories.count()}")
             if not advisories.exists():
                 break
             process_advisories(
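Both hunks in this file hand the same progress_logger callable to advisory_func, whether it runs inline or in a worker pool. A minimal standalone sketch of that fan-out shape (run_batches and its arguments are hypothetical stand-ins; only the serial/parallel split and the executor pattern come from the diff):

    from concurrent import futures

    def run_batches(func, batches, batch_logger, max_workers=0):
        # Serial path, as in the first hunk: used when there are no workers.
        if not max_workers:
            for batch in batches:
                func(advisory_ids=batch, logger=batch_logger)
            return
        # Parallel path, as in the second change: every batch is submitted
        # with the same logger argument. func and batch_logger must be
        # picklable to cross the process boundary.
        with futures.ProcessPoolExecutor(max_workers) as executor:
            future_to_batch = {
                executor.submit(func, batch, batch_logger): batch for batch in batches
            }
            for future in futures.as_completed(future_to_batch):
                future.result()  # re-raises any exception from a worker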

vulnerabilities/pipelines/remove_duplicate_advisories.py

Lines changed: 12 additions & 20 deletions
@@ -27,7 +27,7 @@ def remove_duplicates_batch(advisory_ids, logger=None):
     try:
         with transaction.atomic():
             advisories = Advisory.objects.filter(id__in=advisory_ids).select_for_update(
-                nowait=True, skip_locked=True
+                skip_locked=True
             )
             if not advisories.exists():
                 return
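For context on this change: Django's select_for_update() rejects nowait=True combined with skip_locked=True (the two options are mutually exclusive), so the previous call raised a ValueError before locking anything. A minimal sketch of the surviving locking pattern, reusing the Advisory model and filter from the hunk above:

    from django.db import transaction

    def lock_batch(advisory_ids):
        with transaction.atomic():
            # skip_locked=True silently skips rows locked by another
            # transaction; locks are taken when the queryset is evaluated.
            locked = Advisory.objects.filter(id__in=advisory_ids).select_for_update(skip_locked=True)
            return list(locked)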
@@ -45,28 +45,25 @@ def remove_duplicates_batch(advisory_ids, logger=None):
                 if len(group_advisories) <= 1:
                     continue
 
-                if logger:
-                    logger.info(
-                        f"Found {len(group_advisories)} duplicates for content ID {content_id}",
-                    )
+                logger(
+                    f"Found {len(group_advisories)} duplicates for content ID {content_id}",
+                )
 
                 oldest = min(group_advisories, key=lambda x: x.date_imported)
 
                 advisory_ids_to_delete = [adv.id for adv in group_advisories if adv.id != oldest.id]
                 if advisory_ids_to_delete:
                     Advisory.objects.filter(id__in=advisory_ids_to_delete).delete()
-                    if logger:
-                        logger.info(
-                            f"Kept advisory {oldest.id} and removed "
-                            f"{len(advisory_ids_to_delete)} duplicates for content ID {content_id}",
-                        )
+                    logger(
+                        f"Kept advisory {oldest.id} and removed "
+                        f"{len(advisory_ids_to_delete)} duplicates for content ID {content_id}",
+                    )
 
     except Exception as e:
-        if logger:
-            logger(
-                f"Error processing batch of advisories: {e}",
-                level=logging.ERROR,
-            )
+        logger(
+            f"Error processing batch of advisories: {e}",
+            level=logging.ERROR,
+        )
 
 
 class RemoveDuplicateAdvisoriesPipeline(VulnerableCodePipeline):
@@ -91,19 +88,14 @@ def remove_duplicates(self):
             .filter(count__gt=1)
             .values_list("unique_content_id", flat=True)
         )
-
-        print(f"duplicate_content_ids: {duplicate_content_ids}")
-
         advisories = Advisory.objects.filter(unique_content_id__in=duplicate_content_ids)
-
         if not advisories.exists():
             break
 
         self.log(
             f"Processing {advisories.count()} content IDs with duplicates",
             level=logging.INFO,
         )
-
         process_advisories(
             advisories=advisories,
             advisory_func=remove_duplicates_batch,
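The selection rule implemented by remove_duplicates_batch above (group advisories by content ID, keep the oldest by date_imported, delete the rest) as a minimal standalone sketch over plain objects; the grouping container is an assumption, the rule itself is from the diff:

    from collections import defaultdict

    def ids_to_delete(advisories):
        # Group advisories sharing a unique_content_id, then within each
        # group keep the earliest date_imported and mark the rest.
        groups = defaultdict(list)
        for adv in advisories:
            groups[adv.unique_content_id].append(adv)

        doomed = []
        for group in groups.values():
            if len(group) <= 1:
                continue
            oldest = min(group, key=lambda a: a.date_imported)
            doomed.extend(a.id for a in group if a.id != oldest.id)
        return doomed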

vulnerabilities/tests/test_recompute_content_id.py

Lines changed: 66 additions & 8 deletions
@@ -44,7 +44,7 @@ def advisory_data():
 
 
 @pytest.mark.django_db
-def test_recompute_content_ids_basic(advisory_data):
+def test_recompute_content_ids_basic_async(advisory_data):
     """
     Test that advisories without content IDs get them computed.
     """
@@ -68,7 +68,7 @@ def test_recompute_content_ids_basic(advisory_data):
 
 
 @pytest.mark.django_db
-def test_recompute_content_ids_multiple_batches(advisory_data):
+def test_recompute_content_ids_multiple_batches_async(advisory_data):
     """
     Test that content ID computation works across multiple batches.
     """
@@ -103,6 +103,66 @@ def test_recompute_content_ids_multiple_batches(advisory_data):
     assert Advisory.objects.exclude(unique_content_id__length=64).count() == 0
 
 
+@pytest.mark.django_db
+def test_recompute_content_ids_basic(advisory_data):
+    """
+    Test that advisories without content IDs get them computed.
+    """
+    advisory = Advisory.objects.create(
+        summary=advisory_data.summary,
+        affected_packages=[pkg.to_dict() for pkg in advisory_data.affected_packages],
+        references=[ref.to_dict() for ref in advisory_data.references],
+        unique_content_id="",
+        date_collected=datetime.datetime(2024, 1, 1, tzinfo=pytz.UTC),
+    )
+
+    with patch("vulnerabilities.pipelines.recompute_content_ids.get_max_workers") as mock_workers:
+        mock_workers.return_value = 0
+
+        pipeline = RecomputeContentIDPipeline()
+        pipeline.recompute_content_ids()
+
+    advisory.refresh_from_db()
+    assert advisory.unique_content_id != ""
+    assert len(advisory.unique_content_id) == 64  # SHA256 hash length
+
+
+@pytest.mark.django_db
+def test_recompute_content_ids_multiple_batches(advisory_data):
+    """
+    Test that content ID computation works across multiple batches.
+    """
+    dates = [
+        datetime.datetime(
+            2024 + (i // (12 * 28)),  # Year
+            ((i // 28) % 12) + 1,  # Month (1-12)
+            (i % 28) + 1,  # Day (1-28)
+            tzinfo=pytz.UTC,
+        )
+        for i in range(2500)  # Create 2500 advisories
+    ]
+
+    for date in dates:
+        Advisory.objects.create(
+            summary=advisory_data.summary,
+            affected_packages=[pkg.to_dict() for pkg in advisory_data.affected_packages],
+            references=[ref.to_dict() for ref in advisory_data.references],
+            unique_content_id="",
+            date_imported=date,
+            date_collected=date,
+        )
+
+    with patch("vulnerabilities.pipelines.recompute_content_ids.get_max_workers") as mock_workers:
+        mock_workers.return_value = 0
+
+        pipeline = RecomputeContentIDPipeline()
+        pipeline.BATCH_SIZE = 1000
+        pipeline.recompute_content_ids()
+
+    assert not Advisory.objects.filter(unique_content_id="").exists()
+    assert Advisory.objects.exclude(unique_content_id__length=64).count() == 0
+
+
 @pytest.mark.django_db
 def test_recompute_content_ids_preserves_existing(advisory_data):
     """
@@ -137,7 +197,7 @@ def test_recompute_content_ids_error_handling(advisory_data):
         summary=advisory_data.summary,
         affected_packages=[pkg.to_dict() for pkg in advisory_data.affected_packages],
         references=[ref.to_dict() for ref in advisory_data.references],
-        unique_content_id="",
+        unique_content_id="Test",
         date_collected=datetime.datetime(2024, 1, 1, tzinfo=pytz.UTC),
     )
 
@@ -150,8 +210,6 @@ def test_recompute_content_ids_error_handling(advisory_data):
         mock_workers.return_value = 0
 
         pipeline = RecomputeContentIDPipeline()
-        pipeline.recompute_content_ids()
-
-        advisory = Advisory.objects.first()
-        assert advisory is not None
-        assert advisory.unique_content_id == ""
+        # expect an error
+        with pytest.raises(Exception):
+            pipeline.recompute_content_ids()

vulnerabilities/tests/test_remove_duplicate_advisories.py

Lines changed: 33 additions & 2 deletions
@@ -65,8 +65,39 @@ def test_remove_duplicates_keeps_oldest(advisory_data):
         )
 
     with patch("vulnerabilities.pipelines.recompute_content_ids.get_max_workers") as mock_workers:
-        mock_workers.return_value = 4  # Simulate 4 workers with keep_available=0
+        mock_workers.return_value = 0  # Simulate 4 workers with keep_available=0
+        pipeline = RemoveDuplicateAdvisoriesPipeline()
+        pipeline.remove_duplicates()
+
+    # Check that only the oldest advisory remains
+    remaining = Advisory.objects.all()
+    assert remaining.count() == 1
+    assert remaining.first().date_imported == dates[0]
 
+
+@pytest.mark.django_db
+def test_remove_duplicates_keeps_oldest_async(advisory_data):
+    """
+    Test that when multiple advisories have the same content,
+    only the oldest one is kept.
+    """
+    dates = [
+        datetime.datetime(2024, 1, 1, tzinfo=pytz.UTC),
+        datetime.datetime(2024, 1, 2, tzinfo=pytz.UTC),
+        datetime.datetime(2024, 1, 3, tzinfo=pytz.UTC),
+    ]
+
+    for date in dates:
+        Advisory.objects.create(
+            summary=advisory_data.summary,
+            affected_packages=[pkg.to_dict() for pkg in advisory_data.affected_packages],
+            references=[ref.to_dict() for ref in advisory_data.references],
+            date_imported=date,
+            date_collected=date,
+        )
+
+    with patch("vulnerabilities.pipelines.recompute_content_ids.get_max_workers") as mock_workers:
+        mock_workers.return_value = 4  # Simulate 4 workers with keep_available=0
         pipeline = RemoveDuplicateAdvisoriesPipeline()
         pipeline.remove_duplicates()
 
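The sync and async variants differ only in the patched worker count: returning 0 sends process_advisories down the serial loop shown in recompute_content_ids.py above, while 4 routes batches through the ProcessPoolExecutor. A condensed view of the switch (assuming get_max_workers is the only knob, as the patch target suggests):

    with patch("vulnerabilities.pipelines.recompute_content_ids.get_max_workers") as mock_workers:
        mock_workers.return_value = 0  # 0 -> serial in-process loop; 4 -> 4-worker pool
        RemoveDuplicateAdvisoriesPipeline().remove_duplicates()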

@@ -122,7 +153,7 @@ def test_remove_duplicates_with_multiple_batches(advisory_data):
             (i % 28) + 1,  # Day (1-28)
             tzinfo=pytz.UTC,
         )
-        for i in range(100)  # Create 2500 advisories
+        for i in range(100)
     ]
 
     for date in dates:
