Skip to content

Commit de0d9b5

Browse files
chg: Dataset generation: CVSS, CPE, title and description (summary) are now extracted from CSAF document.
1 parent afcfc9a commit de0d9b5

File tree

3 files changed

+87
-0
lines changed

3 files changed

+87
-0
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
### Changes
66

77
- Dataset generation: CVSS are now extracted from GitHub and PySec security advisories.
8+
- Dataset generation: CVSS scores, CPEs, title and description (summary) are now extracted from CSAF documents.
89

910

1011
## Release 1.1.0 (2025-02-27)

vulntrain/datasets/create_dataset.py

+38
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,11 @@
99
from vulntrain.utils import (
1010
strip_markdown,
1111
extract_cpe,
12+
extract_cpe_csaf,
1213
extract_cvss_cve,
1314
extract_cvss_from_github_advisory,
1415
extract_cvss_from_pysec,
16+
extract_cvss_from_csaf,
1517
)
1618

1719

@@ -120,6 +122,40 @@ def extract_pysec(self, vuln: dict[str, Any]) -> dict[str, Any]:
120122
"cvss_v2_0": cvss_scores.get("cvss_v2_0", None),
121123
}
122124

125+
def extract_csaf(self, vuln: dict[str, Any]) -> dict[str, Any]:
    """Build a flat dataset record from a parsed CSAF document.

    The description is the text of every note of category "summary"
    attached to the entries of ``vulnerabilities``, joined with single
    spaces.  When that yields nothing, the first non-empty "summary" note
    of the ``document`` section is used instead; if there is none either,
    the description is the empty string.

    Args:
        vuln: The CSAF document as a dict.

    Returns:
        A dict with the keys ``id``, ``title``, ``description``, ``cpes``,
        ``cvss_v4_0``, ``cvss_v3_1``, ``cvss_v3_0`` and ``cvss_v2_0``.  A
        CVSS entry is ``None`` when ``extract_cvss_from_csaf`` returned no
        value under that key.

    Raises:
        KeyError: If ``document``, ``document.tracking.id`` or
            ``document.title`` is missing.
    """

    def summary_texts(notes):
        """Yield the non-empty text of each note of category "summary"."""
        for note in notes or []:
            # Notes without text are skipped rather than raising KeyError
            # or contributing stray spaces to the joined description.
            if note.get("category") == "summary" and note.get("text"):
                yield note["text"]

    cvss_scores = extract_cvss_from_csaf(vuln)

    description = " ".join(
        text
        for vulnerability in vuln.get("vulnerabilities") or []
        for text in summary_texts(vulnerability.get("notes"))
    )
    if not description:
        # No per-vulnerability summary: fall back to the document-level one.
        document_notes = (vuln.get("document") or {}).get("notes")
        description = next(summary_texts(document_notes), "")

    # Identifier and title are mandatory for a record, so they are indexed
    # directly and a missing key surfaces as KeyError.
    document = vuln["document"]
    return {
        "id": document["tracking"]["id"],
        "title": document["title"],
        "description": description,
        "cpes": extract_cpe_csaf(vuln),
        "cvss_v4_0": cvss_scores.get("cvss_v4_0", None),
        "cvss_v3_1": cvss_scores.get("cvss_v3_1", None),
        "cvss_v3_0": cvss_scores.get("cvss_v3_0", None),
        "cvss_v2_0": cvss_scores.get("cvss_v2_0", None),
    }
158+
123159
def __call__(self) -> Generator[dict[str, Any], None, None]:
124160
count = 0
125161
for source in self.sources:
@@ -130,6 +166,8 @@ def __call__(self) -> Generator[dict[str, Any], None, None]:
130166
extractor = self.extract_ghsa
131167
case "pysec":
132168
extractor = self.extract_pysec
169+
case str() as s if s.startswith("csaf_"):
170+
extractor = self.extract_csaf
133171
case _:
134172
print("No parser for this source.")
135173
continue

vulntrain/utils.py

+48
Original file line numberDiff line numberDiff line change
@@ -158,3 +158,51 @@ def extract_cvss_from_pysec(data) -> dict[str, float]:
158158
continue
159159

160160
return cvss_scores
161+
162+
163+
def extract_cvss_from_csaf(data) -> dict[str, float]:
    """Collect CVSS base scores from the ``vulnerabilities`` of a CSAF document.

    Every dict-valued member of every entry in ``scores`` is inspected.  If
    it carries a ``vectorString``, the CVSS version is read from the
    ``CVSS:x.y`` prefix of the vector or, when the vector has no such prefix
    (as is the case for CVSS v2 vectors), from the member's ``version``
    field.  The base score is then computed from the vector with
    ``cvss_base_score``.

    Args:
        data: The CSAF document as a dict.

    Returns:
        A mapping from the key produced by ``format_cvss_version`` to the
        base score as a float.  When several scores share a version, the
        last one encountered wins.  Scores that cannot be computed are
        skipped, so the mapping may be empty.
    """
    cvss_scores = {}

    for vulnerability in data.get("vulnerabilities") or []:
        for score in vulnerability.get("scores") or []:
            for value in score.values():
                # Non-dict members (e.g. the list of affected products)
                # cannot hold a vector.
                if not isinstance(value, dict):
                    continue

                vector = value.get("vectorString")
                if not vector or not isinstance(vector, str):
                    continue

                match = re.search(r"CVSS:(\d\.\d)", vector)
                if match:
                    version = match.group(1)
                else:
                    # CVSS v2 vectors carry no "CVSS:x.y" prefix; use the
                    # explicit version field of the score object instead.
                    # NOTE(review): assumes cvss_base_score accepts v2
                    # vectors -- if it does not, the except below skips it.
                    version = value.get("version")
                    if not isinstance(version, str) or not re.fullmatch(
                        r"\d\.\d", version
                    ):
                        continue

                try:
                    key = format_cvss_version(version)
                    cvss_scores[key] = float(cvss_base_score(vector, key))
                except Exception:
                    # Deliberately best-effort: a malformed vector must not
                    # abort the extraction of the remaining scores.
                    continue

    return cvss_scores
186+
187+
188+
def extract_cpe_csaf(data) -> list[str]:
    """Return the sorted, de-duplicated CPEs found in a CSAF product tree.

    Walks ``product_tree.branches`` to any depth and collects the ``cpe``
    value of each branch's ``product.product_identification_helper``.
    Missing or ``None`` members at any level are treated as empty.

    Args:
        data: The CSAF document as a dict.

    Returns:
        The distinct CPE values in ascending order; an empty list when the
        document has no product tree or no CPE.
    """
    # NOTE(review): only ``branches`` is inspected; products listed elsewhere
    # in the product tree are not considered -- confirm this is intended.
    cpes = set()

    product_tree = data.get("product_tree") or {}
    # Explicit stack instead of recursion: visiting order is irrelevant
    # because the result is sorted, and deep trees cannot hit the recursion
    # limit.
    pending = list(product_tree.get("branches") or [])

    while pending:
        branch = pending.pop()
        if not isinstance(branch, dict):
            continue

        product = branch.get("product") or {}
        helper = product.get("product_identification_helper") or {}
        cpe = helper.get("cpe")
        if cpe:
            cpes.add(cpe)

        # Descend into nested branches, if any.
        pending.extend(branch.get("branches") or [])

    return sorted(cpes)

0 commit comments

Comments
 (0)