From f43faa97424d3ed8382248b495f19f541546216c Mon Sep 17 00:00:00 2001
From: kunalsz
Date: Wed, 2 Apr 2025 06:15:51 -0400
Subject: [PATCH] Replace paginated with iterator

Signed-off-by: kunalsz
---
 vulnerabilities/improvers/default.py              |  4 ++--
 vulnerabilities/improvers/valid_versions.py       |  4 ++--
 vulnerabilities/management/commands/export.py     |  2 +-
 vulnerabilities/models.py                         | 14 --------------
 vulnerabilities/pipelines/__init__.py             |  2 +-
 vulnerabilities/pipelines/add_cvss31_to_CVEs.py   |  2 +-
 vulnerabilities/pipelines/collect_commits.py      |  2 +-
 vulnerabilities/pipelines/compute_package_risk.py |  6 ++++--
 vulnerabilities/pipelines/flag_ghost_packages.py  |  2 +-
 9 files changed, 13 insertions(+), 25 deletions(-)

diff --git a/vulnerabilities/improvers/default.py b/vulnerabilities/improvers/default.py
index f2e9009e8..7e1a90fda 100644
--- a/vulnerabilities/improvers/default.py
+++ b/vulnerabilities/improvers/default.py
@@ -44,9 +44,9 @@ def interesting_advisories(self) -> QuerySet:
             return (
                 Advisory.objects.filter(Q(created_by=self.importer.qualified_name))
                 .order_by("-date_collected")
-                .paginated()
+                .iterator()
             )
-        return Advisory.objects.all().order_by("-date_collected").paginated()
+        return Advisory.objects.all().order_by("-date_collected").iterator()
 
     def get_inferences(self, advisory_data: AdvisoryData) -> Iterable[Inference]:
         if not advisory_data:
diff --git a/vulnerabilities/improvers/valid_versions.py b/vulnerabilities/improvers/valid_versions.py
index 916f36f59..8791020b4 100644
--- a/vulnerabilities/improvers/valid_versions.py
+++ b/vulnerabilities/improvers/valid_versions.py
@@ -64,8 +64,8 @@ class ValidVersionImprover(Improver):
     @property
     def interesting_advisories(self) -> QuerySet:
         if issubclass(self.importer, VulnerableCodeBaseImporterPipeline):
-            return Advisory.objects.filter(Q(created_by=self.importer.pipeline_id)).paginated()
-        return Advisory.objects.filter(Q(created_by=self.importer.qualified_name)).paginated()
+            return Advisory.objects.filter(Q(created_by=self.importer.pipeline_id)).iterator()
+        return Advisory.objects.filter(Q(created_by=self.importer.qualified_name)).iterator()
 
     def get_package_versions(
         self, package_url: PackageURL, until: Optional[datetime] = None
diff --git a/vulnerabilities/management/commands/export.py b/vulnerabilities/management/commands/export.py
index 08685e33d..0da38bbdc 100644
--- a/vulnerabilities/management/commands/export.py
+++ b/vulnerabilities/management/commands/export.py
@@ -159,7 +159,7 @@ def packages_by_type_ns_name():
                 "fixing_vulnerabilities__weaknesses",
                 "fixing_vulnerabilities__severities",
             )
-            .paginated()
+            .iterator()
         )
 
     for tp_ns_name, packages in groupby(qs, key=by_purl_type_ns_name):
diff --git a/vulnerabilities/models.py b/vulnerabilities/models.py
index dba205500..920ea3727 100644
--- a/vulnerabilities/models.py
+++ b/vulnerabilities/models.py
@@ -75,20 +75,6 @@ def get_or_none(self, *args, **kwargs):
         with suppress(self.model.DoesNotExist, ValidationError):
             return self.get(*args, **kwargs)
 
-    def paginated(self, per_page=5000):
-        """
-        Iterate over a (large) QuerySet by chunks of ``per_page`` items.
-        This technique is essential for preventing memory issues when iterating
-        See these links for inspiration:
-        https://nextlinklabs.com/resources/insights/django-big-data-iteration
-        https://stackoverflow.com/questions/4222176/why-is-iterating-through-a-large-django-queryset-consuming-massive-amounts-of-me/
-        """
-        paginator = Paginator(self, per_page=per_page)
-        for page_number in paginator.page_range:
-            page = paginator.page(page_number)
-            for obj in page.object_list:
-                yield obj
-
 
 class VulnerabilityQuerySet(BaseQuerySet):
     def affecting_vulnerabilities(self):
diff --git a/vulnerabilities/pipelines/__init__.py b/vulnerabilities/pipelines/__init__.py
index d74db9f35..1b3613933 100644
--- a/vulnerabilities/pipelines/__init__.py
+++ b/vulnerabilities/pipelines/__init__.py
@@ -170,7 +170,7 @@ def import_new_advisories(self):
         imported_advisory_count = 0
         progress = LoopProgress(total_iterations=new_advisories_count, logger=self.log)
 
-        for advisory in progress.iter(new_advisories.paginated()):
+        for advisory in progress.iter(new_advisories.iterator()):
             self.import_advisory(advisory=advisory)
             if advisory.date_imported:
                 imported_advisory_count += 1
diff --git a/vulnerabilities/pipelines/add_cvss31_to_CVEs.py b/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
index a9791d29c..d10d0a6c1 100644
--- a/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
+++ b/vulnerabilities/pipelines/add_cvss31_to_CVEs.py
@@ -48,7 +48,7 @@ def process_cve_advisory_mapping(self):
         batch_size = 1000
         results = []
 
-        for severity in progress.iter(nvd_severities.paginated(per_page=batch_size)):
+        for severity in progress.iter(nvd_severities.iterator(chunk_size=batch_size)):
             cve_pattern = re.compile(r"(CVE-\d{4}-\d{4,7})").search
             cve_match = cve_pattern(severity.url)
             if cve_match:
diff --git a/vulnerabilities/pipelines/collect_commits.py b/vulnerabilities/pipelines/collect_commits.py
index 92145c051..3d60b338f 100644
--- a/vulnerabilities/pipelines/collect_commits.py
+++ b/vulnerabilities/pipelines/collect_commits.py
@@ -52,7 +52,7 @@ def collect_and_store_fix_commits(self):
         )
 
         for apv in progress.iter(
-            affected_by_package_related_vulnerabilities.paginated(per_page=500)
+            affected_by_package_related_vulnerabilities.iterator(chunk_size=500)
         ):
             vulnerability = apv.vulnerability
             for reference in vulnerability.references.all():
diff --git a/vulnerabilities/pipelines/compute_package_risk.py b/vulnerabilities/pipelines/compute_package_risk.py
index 7ac4de838..8e1a47ce1 100644
--- a/vulnerabilities/pipelines/compute_package_risk.py
+++ b/vulnerabilities/pipelines/compute_package_risk.py
@@ -54,7 +54,9 @@ def compute_and_store_vulnerability_risk_score(self):
         updated_vulnerability_count = 0
         batch_size = 5000
 
-        for vulnerability in progress.iter(affected_vulnerabilities.paginated(per_page=batch_size)):
+        for vulnerability in progress.iter(
+            affected_vulnerabilities.iterator(chunk_size=batch_size)
+        ):
             severities = vulnerability.severities.all()
             references = vulnerability.references.all()
             exploits = vulnerability.exploits.all()
@@ -110,7 +112,7 @@ def compute_and_store_package_risk_score(self):
         updated_package_count = 0
         batch_size = 10000
 
-        for package in progress.iter(affected_packages.paginated(per_page=batch_size)):
+        for package in progress.iter(affected_packages.iterator(chunk_size=batch_size)):
             risk_score = compute_package_risk(package)
 
             if not risk_score:
diff --git a/vulnerabilities/pipelines/flag_ghost_packages.py b/vulnerabilities/pipelines/flag_ghost_packages.py
index 7daee4115..33e504676 100644
--- a/vulnerabilities/pipelines/flag_ghost_packages.py
+++ b/vulnerabilities/pipelines/flag_ghost_packages.py
@@ -49,7 +49,7 @@ def detect_and_flag_ghost_packages(logger=None):
     )
 
     grouped_packages = groupby(
-        interesting_packages_qs.paginated(),
+        interesting_packages_qs.iterator(),
         key=lambda pkg: (pkg.type, pkg.namespace, pkg.name),
     )