chg: Dataset generation: CVSS, CPE, title and description (summary) are now extracted from CSAF documents.

cedricbonhomme · cedricbonhomme · commit de0d9b52d8d0 · 2025-03-11T08:26:03.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Changes
 
 - Dataset generation: CVSS are now extracted from GitHub and PySec security advisories.
+- Dataset generation: CVSS, CPE, title and description (summary) are now extracted from CSAF documents.
 
 
 ## Release 1.1.0 (2025-02-27)
diff --git a/vulntrain/datasets/create_dataset.py b/vulntrain/datasets/create_dataset.py
@@ -9,9 +9,11 @@
 from vulntrain.utils import (
     strip_markdown,
     extract_cpe,
+    extract_cpe_csaf,
     extract_cvss_cve,
     extract_cvss_from_github_advisory,
     extract_cvss_from_pysec,
+    extract_cvss_from_csaf,
 )
 
 
@@ -120,6 +122,40 @@ def extract_pysec(self, vuln: dict[str, Any]) -> dict[str, Any]:
             "cvss_v2_0": cvss_scores.get("cvss_v2_0", None),
         }
 
+    def extract_csaf(self, vuln: dict[str, Any]) -> dict[str, Any]:
+        """Map a CSAF document to a flat dataset record.
+
+        The description joins the vulnerability-level "summary" notes, falling back
+        to the first document-level one; notes without text are skipped. A missing
+        document title or tracking id raises KeyError.
+        """
+        cvss_scores = extract_cvss_from_csaf(vuln)
+        description = " ".join(
+            note["text"]
+            for vulnerability in vuln.get("vulnerabilities", [])
+            for note in vulnerability.get("notes", [])
+            if note.get("category") == "summary" and note.get("text")
+        )
+        if not description:
+            description = next(
+                (
+                    note["text"]
+                    for note in vuln.get("document", {}).get("notes", [])
+                    if note.get("category") == "summary" and note.get("text")
+                ),
+                "",
+            )
+        return {
+            "id": vuln["document"]["tracking"]["id"],
+            "title": vuln["document"]["title"],
+            "description": description,
+            "cpes": extract_cpe_csaf(vuln),
+            "cvss_v4_0": cvss_scores.get("cvss_v4_0", None),
+            "cvss_v3_1": cvss_scores.get("cvss_v3_1", None),
+            "cvss_v3_0": cvss_scores.get("cvss_v3_0", None),
+            "cvss_v2_0": cvss_scores.get("cvss_v2_0", None),
+        }
+
     def __call__(self) -> Generator[dict[str, Any], None, None]:
         count = 0
         for source in self.sources:
@@ -130,6 +166,8 @@ def __call__(self) -> Generator[dict[str, Any], None, None]:
                     extractor = self.extract_ghsa
                 case "pysec":
                     extractor = self.extract_pysec
+                case str() as s if s.startswith("csaf_"):
+                    extractor = self.extract_csaf
                 case _:
                     print("No parser for this source.")
                     continue
diff --git a/vulntrain/utils.py b/vulntrain/utils.py
@@ -158,3 +158,51 @@ def extract_cvss_from_pysec(data) -> dict[str, float]:
                 continue
 
     return cvss_scores
+
+
+def extract_cvss_from_csaf(data) -> dict[str, float]:
+    """Return the CVSS base scores of a CSAF document, keyed by formatted version.
+
+    The version comes from the vector's "CVSS:x.y" prefix or, for prefix-less CVSS v2
+    vectors, the score object's "version" field; unscorable vectors are skipped.
+    """
+    cvss_scores = {}  # one entry per CVSS version; the last score seen wins
+    for vulnerability in data.get("vulnerabilities", []):
+        for score in vulnerability.get("scores", []):
+            for value in score.values():
+                vector = value.get("vectorString") if isinstance(value, dict) else None
+                if not vector:
+                    continue
+                match = re.search(r"CVSS:(\d\.\d)", vector)
+                version = match.group(1) if match else str(value.get("version", ""))
+                if not re.fullmatch(r"\d\.\d", version):
+                    continue
+                try:
+                    key = format_cvss_version(version)
+                    cvss_scores[key] = float(cvss_base_score(vector, key))
+                except Exception:  # best effort: ignore vectors that cannot be scored
+                    continue
+    return cvss_scores
+
+
+def extract_cpe_csaf(data) -> list[str]:
+    """Return the sorted, de-duplicated CPEs found under product_tree.branches.
+
+    Branches are walked at any nesting depth; each CPE is read from
+    branch.product.product_identification_helper.cpe when that value is present.
+    """
+    cpes = set()
+    # Explicit stack instead of a nested helper: no recursion limit on deep trees
+    # and no local shadowing of the module-level extract_cpe().
+    pending = list(data.get("product_tree", {}).get("branches", []))
+
+    while pending:
+        branch = pending.pop()
+        product = branch.get("product", {})
+        cpe = product.get("product_identification_helper", {}).get("cpe")
+        if cpe:
+            cpes.add(cpe)
+        # Queue the nested branches, if any.
+        pending.extend(branch.get("branches", []))
+
+    return sorted(cpes)