diff --git a/README.md b/README.md index 2daa9c1..413c13b 100644 --- a/README.md +++ b/README.md @@ -1,19 +1,16 @@ # Introduction -This repo is a vulnerability database and package search for sources such as Aqua Security vuln-list, OSV, NVD, GitHub, and NPM. Vulnerability data are downloaded from the sources and stored in a sqlite based storage with indexes to allow offline access and quick searches. +This repo is a vulnerability database and package search for sources such as AppThreat vuln-list, OSV, NVD, and GitHub. Vulnerability data are downloaded from the sources and stored in a sqlite based storage with indexes to allow offline access and quick searches. ## Vulnerability Data sources - Linux [vuln-list](https://github.com/appthreat/vuln-list) (Forked from AquaSecurity) - OSV (1) -- NVD (2) +- NVD - GitHub -- NPM 1 - We exclude Linux and oss-fuzz feeds by default. Set the environment variable `OSV_INCLUDE_FUZZ` to include them. -2 - We exclude hardware (h) by default. Set the environment variable `NVD_EXCLUDE_TYPES` to exclude additional types such as OS (o) or application (a). An empty value means include all categories. Comma-separated values are allowed. 
Eg: `o,h` - ## Linux distros - AlmaLinux
diff --git a/contrib/cpe_research.py b/contrib/cpe_research.py
new file mode 100644
index 0000000..4d96759
--- /dev/null
+++ b/contrib/cpe_research.py
@@ -0,0 +1,93 @@
+"""List CVE index entries whose type is not a known purl type, for CPE research."""
+
+from collections.abc import Iterable, Iterator, Sequence
+from typing import Any
+
+import apsw
+import orjson
+
+from vdb.lib import KNOWN_PKG_TYPES, db6
+from vdb.lib.cve_model import CVE, CVE1
+
+CVE_DATA_QUERY = (
+    "SELECT cve_id, type, namespace, name, json_object('source', source_data) "
+    "FROM cve_data WHERE cve_id = ? AND type = ? ORDER BY cve_id DESC;"
+)
+
+
+def get_cve_data(
+    db_conn, index_hits: Iterable[Sequence[Any]]
+) -> Iterator[dict[str, Any]]:
+    """Yield the CVE data rows that correspond to the given index hits.
+
+    Args:
+        db_conn: DB connection, or None to open a new read-only one
+        index_hits: Index rows. Only positions 0 (cve_id), 1 (type) and the
+            last one (reported as purl_prefix) are read
+
+    Yields:
+        dict: cve_id, type, namespace, name, purl_prefix and source_data. The
+            source data is a CVE pydantic model, or None when it is missing
+    """
+    if not db_conn:
+        db_conn, _ = db6.get(read_only=True)
+    for ahit in index_hits:
+        results: apsw.Cursor = db_conn.execute(CVE_DATA_QUERY, (ahit[0], ahit[1]))
+        for cve_id, pkg_type, namespace, name, source_json in results:
+            # json_object() turns a NULL column into {"source": null}, so the
+            # decoded value has to be checked as well as the raw string
+            source = orjson.loads(source_json)["source"] if source_json else None
+            yield {
+                "cve_id": cve_id,
+                "type": pkg_type,
+                "namespace": namespace,
+                "name": name,
+                "purl_prefix": ahit[-1],
+                "source_data": (
+                    CVE(root=CVE1.model_validate(source, strict=False))
+                    if source is not None
+                    else None
+                ),
+            }
+
+
+def get_unmapped_namespaces() -> list:
+    """Print and return the CVE entries without a precise purl prefix.
+
+    Returns:
+        list: One tuple per printed entry, in the printed column order
+    """
+    db_conn, index_conn = db6.get(read_only=True)
+    # Bind the known types as parameters instead of formatting them into the SQL
+    placeholders = ", ".join("?" for _ in KNOWN_PKG_TYPES)
+    raw_hits = index_conn.execute(
+        "select distinct cve_id, type, namespace, name, purl_prefix "
+        f"from cve_index where type not in ({placeholders})",
+        tuple(KNOWN_PKG_TYPES),
+    )
+    unmapped = []
+    for ahit in raw_hits:
+        for data in get_cve_data(db_conn, [ahit]):
+            if data["source_data"] is None:
+                # Nothing to report without the original CVE record
+                continue
+            cna = data["source_data"].root.containers.cna
+            # NOTE(review): cpes is assumed to be optional per affected entry
+            cpes = [", ".join(b.root for b in (a.cpes or [])) for a in cna.affected.root]
+            ref_urls = [str(a.url.root) for a in cna.references.root]
+            row = (
+                data["cve_id"],
+                data["type"],
+                data["namespace"],
+                data["name"],
+                data["purl_prefix"],
+                cpes,
+                ref_urls,
+            )
+            print(*row)
+            unmapped.append(row)
+    return unmapped
+
+
+if __name__ == "__main__":
+    get_unmapped_namespaces()
diff --git a/vdb/lib/__init__.py b/vdb/lib/__init__.py index 29b8d9d..1b3b936 100644 --- a/vdb/lib/__init__.py +++ b/vdb/lib/__init__.py @@ -9,6 +9,7 @@ # Known application package types # See https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst +# chainguard and wolfi has been added for suppression purposes since the data quality is poor KNOWN_PKG_TYPES = [ "alpm", "bitbucket", @@ -33,7 +34,6 @@ "gem", "rubygems", "golang", - "crates", "clojars", "conan", "pub", @@ -50,20 +50,21 @@ "linux", "swid", "oss-fuzz", - "ebuild" + "ebuild", + "swift", ] # Maps variations of string to package types PKG_TYPES_MAP = { "composer": ["php", "laravel", "wordpress", "joomla"], "maven": ["jenkins", "java", "kotlin", "groovy", "clojars", "hackage"], - "npm": ["javascript", "node.js", "nodejs"], + "npm": ["javascript", "node.js", "nodejs", "npmjs"], "nuget": [".net_framework", "csharp", ".net_core", "asp.net"], "pypi": ["python"], "gem": ["ruby"], "rubygems": ["ruby", "gem"], "golang": ["go"], - "cargo": ["rust", "crates.io", "cargo"], + "cargo": ["rust", "crates.io", "crates"], "pub": ["dart"], "hex": ["elixir"], "github": ["actions"], @@ -78,6 +79,7 @@ "suse", "opensuse", "fedora", + "fedoraproject" ], "alpm": ["arch", "archlinux"], "ebuild": ["gentoo", "portage"] diff --git a/vdb/lib/aqua.py b/vdb/lib/aqua.py index 3235673..c59fc7d 100644 --- a/vdb/lib/aqua.py +++ b/vdb/lib/aqua.py @@ -189,8 +189,8 @@ def alsa_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor=vendor, - product=pkg_name, + vendor="rpm", + product=f"{vendor}/{pkg_name}", version="*", edition="*", version_start_including=version_start_including, @@ -222,9 +222,9 @@ def alas_rlsa_to_vuln(cve_data, vendor): """Amazon Linux""" ret_data = [] packages = 
cve_data.get("packages", []) - if not packages or not len(packages) > 0: - return ret_data cve_id = cve_data.get("id") + if not packages or cve_id in ("CVE-PENDING",) or not len(packages) > 0: + return ret_data cwe_id = "" cve_references = cve_data.get("references", []) references = [] @@ -272,8 +272,8 @@ def alas_rlsa_to_vuln(cve_data, vendor): references=references, description="", vectorString=vector_string, - vendor=vendor, - product=pkg_name, + vendor="rpm", + product=f"{vendor}/{pkg_name}", version="*", edition="*", version_start_including=version_start_including, @@ -372,8 +372,8 @@ def ubuntu_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor=vendor, - product=full_pkg_name, + vendor="deb", + product=f"{vendor}/{full_pkg_name}", version="*", edition=distro_name, version_start_including=version_start_including, @@ -483,8 +483,8 @@ def redhat_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor="redhat", - product=pkg_name, + vendor="rpm", + product=f"redhat/{pkg_name}", version="*", edition=edition, version_start_including=version_start_including, @@ -549,8 +549,8 @@ def arch_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor="arch", - product=pkg_name, + vendor="alpm", + product=f"arch/{pkg_name}", version="*", edition="*", version_start_including=version_start_including, @@ -645,8 +645,8 @@ def suse_to_vuln(self, cve_data): references=references, description="", vectorString=vector_string, - vendor="suse", - product=pkg_name, + vendor="rpm", + product=f"suse/{pkg_name}", version="*", edition="*", version_start_including=version_start_including, @@ -717,8 +717,8 @@ def photon_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor="photon", - product=pkg_name, + vendor="rpm", + product=f"photon/{pkg_name}", version="*", edition=distro_name, version_start_including=version_start_including, @@ 
-845,8 +845,8 @@ def debian_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor=vendor, - product=pkg_name, + vendor="deb", + product=f"{vendor}/{pkg_name}", version="*", edition=distro_name if distro_name else "*", version_start_including=version_start_including, @@ -891,9 +891,9 @@ def wolfi_to_vuln(cve_data): for fix_version_start_including, cve_list in cve_data.get("secfixes").items(): for cve_id in cve_list: version_start_including = "" - version_end_including = "" + version_end_including = "*" if fix_version_start_including == "0" else "" version_start_excluding = "" - version_end_excluding = fix_version_start_including + version_end_excluding = fix_version_start_including if fix_version_start_including != "0" else "" fix_version_end_including = "" fix_version_start_excluding = "" fix_version_end_excluding = "" @@ -904,8 +904,8 @@ def wolfi_to_vuln(cve_data): references=references, description="", vectorString=vector_string, - vendor=assigner, - product=pkg_name, + vendor="apk", + product=f"{assigner}/{pkg_name}", version="*", edition="*", version_start_including=version_start_including, diff --git a/vdb/lib/config.py b/vdb/lib/config.py index 5199183..510d9aa 100644 --- a/vdb/lib/config.py +++ b/vdb/lib/config.py @@ -77,11 +77,6 @@ VULN_LIST_URL = "https://github.com/appthreat/vuln-list/archive/refs/heads/main.zip" -# CVE types to exclude - hardware -nvd_exclude_types = ["h"] -if os.getenv("NVD_EXCLUDE_TYPES") is not None: - nvd_exclude_types = os.getenv("NVD_EXCLUDE_TYPES", "").split(",") - # Placeholder fix version to use to indicate max versions PLACEHOLDER_FIX_VERSION = "99.99.9" @@ -117,11 +112,32 @@ VENDOR_TO_VERS_SCHEME = { "almalinux": "rpm", + "rocky": "rpm", + "photon": "rpm", "ubuntu": "deb", "debian": "deb", "suse": "rpm", "redhat": "rpm", "opensuse": "rpm", "alpine": "apk", - "gentoo": "ebuild" + "gentoo": "ebuild", + "amazon": "rpm", + "wolfi": "apk", + "chainguard": "apk" } + +OS_PKG_TYPES = ( + 
"deb", + "apk", + "rpm", + "swid", + "alpm", + "docker", + "oci", + "container", + "generic", + "qpkg", + "buildroot", + "coreos", + "ebuild", +) diff --git a/vdb/lib/cve.py b/vdb/lib/cve.py index 3d6f4e6..1011512 100644 --- a/vdb/lib/cve.py +++ b/vdb/lib/cve.py @@ -177,7 +177,12 @@ def to_cve_affected(avuln: Vulnerability) -> Affected | None: package_name = parts.group("package") if "/" in product: tmp_a = product.split("/") - if len(tmp_a) != 2: + # ubuntu/upstream/virtualbox should become + # product=ubuntu and package_name=upstream/virtualbox + if vendor in config.OS_PKG_TYPES: + product = tmp_a[0] + package_name = "/".join(tmp_a[1:]) + elif len(tmp_a) != 2: if len(tmp_a) > 2 and vendor in ("generic", "swift"): product = os.path.dirname(product) package_name = os.path.basename(package_name) @@ -202,8 +207,15 @@ def to_cve_affected(avuln: Vulnerability) -> Affected | None: if config.VENDOR_TO_VERS_SCHEME.get(vendor): vendor = config.VENDOR_TO_VERS_SCHEME.get(vendor) # This prevents cargo:cargo or nuget:nuget - if product == vendor and vendor in KNOWN_PKG_TYPES: + # or openssl:openssl:openssl + if product == vendor and (package_name == product or vendor in KNOWN_PKG_TYPES): product = None + # Deal with NVD mess such as npmjs or crates + if vendor not in KNOWN_PKG_TYPES: + for k, v in PKG_TYPES_MAP.items(): + if vendor.lower() in v: + vendor = k + break p = Product( vendor=vendor, product=product, diff --git a/vdb/lib/cve_model/__init__.py b/vdb/lib/cve_model/__init__.py index b84e048..7d4c44b 100644 --- a/vdb/lib/cve_model/__init__.py +++ b/vdb/lib/cve_model/__init__.py @@ -7,7 +7,7 @@ from enum import Enum from typing import Annotated, Any, Dict, List, Optional, Type, Union -from pydantic import AnyUrl, AwareDatetime, BaseModel, ConfigDict, Field, RootModel +from pydantic import AnyUrl, AwareDatetime, BaseModel, ConfigDict, Field, NaiveDatetime, RootModel from vdb.lib.cve_model import cvss_v2, cvss_v3 @@ -513,7 +513,7 @@ class CveMetadataPublished(BaseModel): 
Field(None, description="The user that requested the CVE identifier."), ] dateUpdated: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field(None, description="The date/time the record was last updated."), ] serial: Annotated[ @@ -525,14 +525,14 @@ class CveMetadataPublished(BaseModel): ), ] dateReserved: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="The date/time this CVE ID was reserved in the CVE automation workgroup services system. Disclaimer: This date reflects when the CVE ID was reserved, and does not necessarily indicate when this vulnerability was discovered, shared with the affected vendor, publicly disclosed, or updated in CVE.", ), ] datePublished: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="The date/time the CVE Record was first published in the CVE List.", @@ -574,23 +574,23 @@ class CveMetadataRejected(BaseModel): ), ] dateUpdated: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field(None, description="The date/time the record was last updated."), ] datePublished: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="The date/time the CVE Record was first published in the CVE List.", ), ] dateRejected: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field(None, description="The date/time the CVE ID was rejected."), ] state: Annotated[State1, Field(description="State of CVE - PUBLISHED, REJECTED.")] dateReserved: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="The date/time this CVE ID was reserved in the CVE automation workgroup services system. 
Disclaimer: This date reflects when the CVE ID was reserved, and does not necessarily indicate when this vulnerability was discovered, shared with the affected vendor, publicly disclosed, or updated in CVE.", @@ -607,7 +607,7 @@ class ProviderMetadata(BaseModel): Field(None, description="The container provider's organizational short name."), ] dateUpdated: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="Timestamp to be set by the system of record at time of submission. If dateUpdated is provided to the system of record it will be replaced by the current timestamp at the time of submission.", @@ -1212,14 +1212,14 @@ class CnaPublishedContainer(BaseModel): ) providerMetadata: ProviderMetadata dateAssigned: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="The date/time this CVE ID was associated with a vulnerability by a CNA.", ), ] datePublic: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="If known, the date/time the vulnerability was disclosed publicly.", @@ -1260,7 +1260,7 @@ class AdpContainer(BaseModel): ) providerMetadata: ProviderMetadata datePublic: Annotated[ - Optional[AwareDatetime], + Optional[AwareDatetime | NaiveDatetime], Field( None, description="If known, the date/time the vulnerability was disclosed publicly.", diff --git a/vdb/lib/nvd.py b/vdb/lib/nvd.py index 7bc6993..eea9a3e 100644 --- a/vdb/lib/nvd.py +++ b/vdb/lib/nvd.py @@ -215,8 +215,7 @@ def convert_vuln_detail(vuln: dict) -> list[VulnerabilityDetail] | None: if fix_cpe_uri: det["fixed_location"] = fix_cpe_uri adetail = VulnerabilityDetail.from_dict(det) - # Include only the application details - if adetail and adetail.package_type not in config.nvd_exclude_types: + if adetail: details.append(adetail) if not details: return None @@ -249,6 +248,9 @@ def convert_api_vuln_detail(vuln: dict) -> list[VulnerabilityDetail] | 
None: cpe_details_list = [] for cpe in cpe_list: cpe_uri = cpe["criteria"] + # Ignore os and hardware vulnerabilities from nvd + if cpe_uri and cpe_uri.startswith(("cpe:2.3:o", "cpe:2.3:h")): + continue all_parts = CPE_FULL_REGEX.match(cpe_uri) # If a single version is mentioned using cpe then use that as a fallback single_version = "" @@ -284,8 +286,7 @@ def convert_api_vuln_detail(vuln: dict) -> list[VulnerabilityDetail] | None: cpe_details_list.append(new_git_detail) for det in cpe_details_list: adetail = VulnerabilityDetail.from_dict(det) - # Include only the application details - if adetail and adetail.package_type not in config.nvd_exclude_types: + if adetail: details.append(adetail) if not details: return None diff --git a/vdb/lib/search.py b/vdb/lib/search.py index 73f79f0..09a5fdd 100644 --- a/vdb/lib/search.py +++ b/vdb/lib/search.py @@ -3,7 +3,7 @@ import orjson from vdb.lib import db6, utils -from vdb.lib.cve_model import CVE +from vdb.lib.cve_model import CVE, CVE1 def _filter_hits(raw_hits: list, compare_ver: str) -> list: @@ -48,7 +48,7 @@ def get_cve_data(db_conn, index_hits: list[dict, Any], search_str: str) -> list[ "name": res[3], "matching_vers": ahit["vers"], "matched_by": search_str, - "source_data": CVE.model_validate(orjson.loads(res[4])["source"], strict=False) if res[4] else None, + "source_data": CVE(root=CVE1.model_validate(orjson.loads(res[4])["source"], strict=False)) if res[4] else None, "override_data": orjson.loads(res[5])["override"] if res[5] else None }) return data_list