From be62961f773af9eefe60bf62e4c5806167d0d494 Mon Sep 17 00:00:00 2001
From: Mateusz Perc
Date: Thu, 5 Aug 2021 04:57:47 +0200
Subject: [PATCH 1/2] Add AlpinePackages pipeline

A pipeline that complements missing package data.
Downloads aports repository and all its necessary branches (alpine versions)
then iterates over all alpine packages associated with the pipeline's project.
For each package it copies additional files from the aports repository into
scan target directory then downloads and extracts all the source archives,
performs a scan and saves its output to package's database entry.

Signed-off-by: Mateusz Perc
---
 scanpipe/pipelines/alpine_packages.py | 107 +++++++++++++++++++++
 scanpipe/pipes/alpine.py              | 127 ++++++++++++++++++++++++++
 scanpipe/pipes/docker.py              |   1 +
 setup.py                              |   1 +
 4 files changed, 236 insertions(+)
 create mode 100644 scanpipe/pipelines/alpine_packages.py

diff --git a/scanpipe/pipelines/alpine_packages.py b/scanpipe/pipelines/alpine_packages.py
new file mode 100644
index 000000000..bbfd6b6a1
--- /dev/null
+++ b/scanpipe/pipelines/alpine_packages.py
@@ -0,0 +1,107 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes.alpine import download_or_checkout_aports
+from scanpipe.pipes.alpine import extract_summary_fields
+from scanpipe.pipes.alpine import get_unscanned_packages_from_db
+from scanpipe.pipes.alpine import prepare_scan_dir
+from scanpipe.pipes.scancode import run_extractcode
+from scanpipe.pipes.scancode import run_scancode
+
+
+class AlpinePackages(Pipeline):
+    """
+    A pipeline to complement missing alpine package data.
+    Downloads and extracts needed information from aports repository and package source files.
+    Alpine Linux does not provide copyrights and (in some cases) licenses for its packages.
+    """
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.create_alpine_versions_dict,
+            cls.download_aports_repo,
+            cls.complement_missing_package_data,
+        )
+
+    scancode_options = ["--copyright", "--summary"]
+
+    def create_alpine_versions_dict(self):
+        """
+        Create a dict mapping alpine image ids from the database to alpine versions.
+        """
+        self.alpine_versions = {
+            i["image_id"]: i["distro"]["version_id"]
+            for i in self.project.extra_data.get("images", [])
+            if i["distro"]["identifier"] == "alpine"
+        }
+
+    def download_aports_repo(self):
+        """
+        Set pipeline's `aports_dir_path` variable to its project temporary path.
+        Iterate over every distinct alpine version associated with this project.
+        Download corresponding aports repository branches (alpine versions).
+        """
+        self.aports_dir_path = self.project.tmp_path
+        for alpine_version in sorted(set(self.alpine_versions.values())):
+            download_or_checkout_aports(
+                aports_dir_path=self.aports_dir_path, alpine_version=alpine_version
+            )
+
+    def complement_missing_package_data(self):
+        """
+        Iterate over alpine packages associated with this project.
+        Checkout aports repository to the corresponding alpine version and a commit.
+        Prepare scan target directory - download and extract package's sources.
+        Run scancode and extract missing data (only copyrights for now).
+        Update and save package's missing data to database.
+        """
+        for (
+            alpine_version,
+            commit_id,
+            scan_target_path,
+            scan_result_path,
+            package,
+        ) in get_unscanned_packages_from_db(
+            project=self.project, alpine_versions=self.alpine_versions
+        ):
+            if not download_or_checkout_aports(
+                aports_dir_path=self.aports_dir_path,
+                alpine_version=alpine_version,
+                commit_id=commit_id,
+            ) or not prepare_scan_dir(
+                package_name=package.name, scan_target_path=scan_target_path
+            ):
+                continue
+            run_extractcode(location=str(scan_target_path))
+            run_scancode(
+                location=str(scan_target_path),
+                output_file=str(scan_result_path),
+                options=self.scancode_options,
+            )
+            package.update_extra_data(
+                data=extract_summary_fields(
+                    scan_result_path=scan_result_path,
+                    summary_field_names=["copyrights"],
+                )
+            )
diff --git a/scanpipe/pipes/alpine.py b/scanpipe/pipes/alpine.py
index 5340b27d7..3d22dd98c 100644
--- a/scanpipe/pipes/alpine.py
+++ b/scanpipe/pipes/alpine.py
@@ -20,8 +20,135 @@
 # ScanCode.io is a free software code scanning tool from nexB Inc. and others.
 # Visit https://github.com/nexB/scancode.io for support and download.
 
+import json
+from shutil import copytree
+
+from fetchcode import fetch
+from fetchcode.vcs.git import fetch_via_git
 from packagedcode import alpine
 
+from scanpipe.models import DiscoveredPackage
+
+APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
+APORTS_DIR_NAME = "aports"
+APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"]
+
+
+def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
+    """
+    Download the aports repository and its branch based on `alpine_version`.
+    Checkout to a branch (alpine version).
+    If `commit_id` is provided also checkout to a commit.
+    Return the location of the aports checkout as a string.
+    """
+    # TODO: Proper fetchcode patch required (extending #54).
+    major, minor = alpine_version.split(".")[:2]
+    aports_dir_path = str(aports_dir_path / APORTS_DIR_NAME)
+    fetch_via_git(
+        url=f"git+{APORTS_URL}@{major}.{minor}-stable", location=aports_dir_path
+    )
+    if commit_id:
+        fetch_via_git(url=f"git+{APORTS_URL}@{commit_id}", location=aports_dir_path)
+    return aports_dir_path
+
+
+def get_unscanned_packages_from_db(project, alpine_versions):
+    """
+    Return an iterator of 5-tuples
+    (alpine_version, commit_id, scan_target_path, scan_result_path, package)
+    where:
+    `alpine_version` is the alpine version a package comes from (obtained from
+    the `alpine_versions` dict),
+    `commit_id` is the id of the aports repository commit that added the
+    corresponding version of a package,
+    `scan_target_path` is the path of the directory to be scanned,
+    `scan_result_path` is the path of the scan result json file,
+    `package` is a DiscoveredPackage instance of the alpine type that belongs
+    to the `project`.
+    Only alpine packages that are not subpackages and have no existing scan
+    result file are yielded. Packages with an unknown alpine version or with no
+    commit id in their `vcs_url` are skipped: aports cannot be checked out
+    for them.
+    """
+    for package in DiscoveredPackage.objects.filter(project=project, type="alpine"):
+        scan_id = f"{package.name}_{package.version}"
+        scan_result_path = project.output_path / (scan_id + ".json")
+        image_id = (package.extra_data or {}).get("image_id")
+        alpine_version = alpine_versions.get(image_id)
+        # `partition` yields an empty commit id when "id=" is missing where
+        # `split("id=")[1]` would raise an IndexError.
+        commit_id = (package.vcs_url or "").partition("id=")[2]
+        if not alpine_version or not commit_id:
+            continue
+        scan_target_path = project.tmp_path / scan_id
+        not_a_subpackage = (
+            not package.source_packages or package.source_packages[0] in package.purl
+        )
+        scan_result_nonexistent = not scan_result_path.exists()
+        if not_a_subpackage and scan_result_nonexistent:
+            yield alpine_version, commit_id, scan_target_path, scan_result_path, package
+
+
+def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
+    """
+    Gather all the package's source files in `scan_target_path`.
+    Source files of an alpine package are obtained from its aports directory
+    whose location has to be guessed.
+    Such a directory is present in one of the five aports repository
+    subdirectories (main, non-free, testing, community, unmaintained).
+    Its name is the same as the value of the corresponding package's `name`
+    field (hence the `package_name` parameter).
+    Here are some path examples:
+        .../aports/main/acf-db
+        .../aports/non-free/mongodb
+    Inside, there are some extra files (patches) and an APKBUILD which contains
+    urls to source tarballs.
+    The function copies all these files (including APKBUILD) and downloads all
+    the source tarballs to `scan_target_path`.
+    The default value of `aports_dir_path` is the parent of `scan_target_path`.
+    Return `scan_target_path` if the package's aports directory is found and is
+    not empty, None otherwise.
+    """
+    if aports_dir_path is None:
+        aports_dir_path = scan_target_path.parent
+    for subdir_name in APORTS_SUBDIRS:
+        apkbuild_dir = aports_dir_path / APORTS_DIR_NAME / subdir_name / package_name
+        if not apkbuild_dir.exists():
+            continue
+        if not any(apkbuild_dir.iterdir()):
+            # The first existing directory wins: when it is empty there is
+            # nothing to scan and the other subdirectories are not searched.
+            return None
+        copytree(apkbuild_dir, scan_target_path)
+        apkbuild = alpine.parse_apkbuild(scan_target_path / "APKBUILD")
+        # Fall back to empty values rather than calling methods on None when
+        # the APKBUILD yields no package, no `extra_data` or no `sources`.
+        package_data = apkbuild.to_dict() if apkbuild else {}
+        extra_data = package_data.get("extra_data") or {}
+        for source in extra_data.get("sources") or []:
+            source_url = source.get("url")
+            if source_url:
+                fetch(source_url, scan_target_path)
+        return scan_target_path
+    return None
+
+
+def extract_summary_fields(scan_result_path, summary_field_names):
+    """
+    Read the scancode result file at `scan_result_path` and collect the values
+    found in its `summary` section for each name in `summary_field_names`.
+    Return a dict mapping each field name to the list of its non-empty values
+    (an empty list when the summary has no such field).
+    """
+    # The context manager closes the file even if the JSON fails to load.
+    with open(scan_result_path) as scan_result:
+        summaries = json.load(scan_result)["summary"]
+    result = {}
+    for field_name in summary_field_names:
+        values = (summary["value"] for summary in summaries.get(field_name, []))
+        result[field_name] = [v for v in values if v]
+    return result
+
 
 def package_getter(root_dir, **kwargs):
     """
diff --git a/scanpipe/pipes/docker.py b/scanpipe/pipes/docker.py
index b82d5f800..8d36614cc 100644
--- a/scanpipe/pipes/docker.py
+++ b/scanpipe/pipes/docker.py
@@ -122,6 +122,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
 
     for i, (purl, package, layer) in enumerate(installed_packages):
         logger.info(f"Creating package #{i}: {purl}")
+        package.extra_data = {"image_id": image.image_id}
         created_package = pipes.update_or_create_package(project, package.to_dict())
 
         # We have no files for this installed package, we cannot go further.
diff --git a/setup.py b/setup.py
index 6acaec642..b1666e5ec 100755
--- a/setup.py
+++ b/setup.py
@@ -61,6 +61,7 @@
             "root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
             "scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
             "scan_package = scanpipe.pipelines.scan_package:ScanPackage",
+            "alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages",
         ],
     },
     classifiers=[

From ee6d8a256fcc36d7630c3eadbff9f978359c17db Mon Sep 17 00:00:00 2001
From: Mateusz Perc
Date: Mon, 20 Sep 2021 08:28:18 +0200
Subject: [PATCH 2/2] Added test for new alpine pipe functions

Added new tests for functions:
-download_or_checkout_aports
-get_unscanned_packages_from_db
-prepare_scan_dir
-extract_summary_fields

Signed-off-by: Mateusz Perc
---
 .../tests/data/aports/community/A/APKBUILD    |   0
 .../tests/data/aports/community/D/APKBUILD    |   0
 .../tests/data/aports/community/E/NOTAPKBUILD |   0
 scanpipe/tests/data/aports/example/C/APKBUILD |   0
 scanpipe/tests/data/example_scan_summary.json |  34 +++++++
 scanpipe/tests/test_pipes.py                  | 104 +++++++++++++++++++
 6 files changed, 138 insertions(+)
 create mode 100644 scanpipe/tests/data/aports/community/A/APKBUILD
 create mode 100644 scanpipe/tests/data/aports/community/D/APKBUILD
 create mode 100644 scanpipe/tests/data/aports/community/E/NOTAPKBUILD
 create mode 100644 scanpipe/tests/data/aports/example/C/APKBUILD
 create mode 100644 scanpipe/tests/data/example_scan_summary.json

diff --git a/scanpipe/tests/data/aports/community/A/APKBUILD b/scanpipe/tests/data/aports/community/A/APKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/aports/community/D/APKBUILD b/scanpipe/tests/data/aports/community/D/APKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/aports/community/E/NOTAPKBUILD b/scanpipe/tests/data/aports/community/E/NOTAPKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/aports/example/C/APKBUILD b/scanpipe/tests/data/aports/example/C/APKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/example_scan_summary.json b/scanpipe/tests/data/example_scan_summary.json
new file mode 100644
index 000000000..4a4f902b9
--- /dev/null
+++ b/scanpipe/tests/data/example_scan_summary.json
@@ -0,0 +1,34 @@
+{
+  "summary": {
+    "copyrights": [
+      {
+        "value": "Copyright (c) A B",
+        "count": 51
+      },
+      {
+        "value": "Copyright (c) C D",
+        "count": 8
+      }
+    ],
+    "holders": [
+      {
+        "value": "A B",
+        "count": 51
+      },
+      {
+        "value": "C D",
+        "count": 41
+      }
+    ],
+    "authors": [
+      {
+        "value": "A B",
+        "count": 2
+      },
+      {
+        "value": "C D",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py
index 0e5dfb89d..32ea59a1f 100644
--- a/scanpipe/tests/test_pipes.py
+++ b/scanpipe/tests/test_pipes.py
@@ -38,6 +38,7 @@
 from scanpipe.models import CodebaseResource
 from scanpipe.models import DiscoveredPackage
 from scanpipe.models import Project
+from scanpipe.pipes import alpine
 from scanpipe.pipes import codebase
 from scanpipe.pipes import docker
 from scanpipe.pipes import fetch
@@ -756,6 +757,109 @@ def test_scanpipe_pipes_rootfs_has_hash_diff(self):
         codebase_resource = CodebaseResource(sha256="sha256", md5="md5")
         self.assertFalse(rootfs.has_hash_diff(install_file, codebase_resource))
 
+    @mock.patch("scanpipe.pipes.alpine.fetch_via_git")
+    def test_scanpipe_pipes_alpine_download_or_checkout_aports(self, fetch_via_git):
+        example_path = Path()
+        aports_path = str(example_path / alpine.APORTS_DIR_NAME)
+
+        alpine.download_or_checkout_aports(
+            aports_dir_path=example_path, alpine_version="3.13.14"
+        )
+        fetch_via_git.assert_called_with(
+            url=f"git+{alpine.APORTS_URL}@3.13-stable", location=aports_path
+        )
+
+        alpine.download_or_checkout_aports(
+            aports_dir_path=example_path, alpine_version="3.13.14", commit_id="1"
+        )
+        fetch_via_git.assert_called_with(
+            url=f"git+{alpine.APORTS_URL}@1", location=aports_path
+        )
+
+    def test_scanpipe_pipes_alpine_get_unscanned_packages_from_db(self):
+        project = Project.objects.create(name="example")
+        alpine_versions = {"1": "3.12", "2": "3.13"}
+        package_field_names = (
+            "type",
+            "name",
+            "version",
+            "vcs_url",
+            "source_packages",
+            "extra_data",
+        )
+        package_data = [
+            ("debian",),
+            ("rpm",),
+            ("alpine", "A", "1.0", "id=A", [], {"image_id": "1"}),
+            ("alpine", "B", "1.0", "id=B", [], {"image_id": "2"}),
+        ]
+        # The test will get bigger (thus arrays and loops instead of
+        # consecutive function calls) - further patches for this function expected.
+        expected_package_tuples = [
+            (
+                "3.13",
+                "B",
+                project.tmp_path / "B_1.0",
+                project.output_path / "B_1.0.json",
+            ),
+        ]
+        (project.output_path / "A_1.0.json").touch()
+        for package_data_tuple in package_data:
+            DiscoveredPackage.objects.create(
+                project=project, **dict(zip(package_field_names, package_data_tuple))
+            )
+        yielded_package_tuples = alpine.get_unscanned_packages_from_db(
+            project=project, alpine_versions=alpine_versions
+        )
+        # Compare whole lists: asserting inside a loop over the generator would
+        # pass vacuously if fewer packages than expected were yielded.
+        self.assertEqual(
+            expected_package_tuples,
+            [package_tuple[:4] for package_tuple in yielded_package_tuples],
+        )
+
+    @mock.patch("scanpipe.pipes.alpine.alpine.parse_apkbuild")
+    @mock.patch("scanpipe.pipes.alpine.copytree")
+    def test_scanpipe_pipes_alpine_prepare_scan_dir(self, copytree, parse_apkbuild):
+        example_path = Path()
+
+        # These empty directories are not shipped with the test data, create them.
+        aports_path = self.data_location / alpine.APORTS_DIR_NAME
+        (aports_path / "main" / "A").mkdir(parents=True, exist_ok=True)
+        (aports_path / "non-free" / "A").mkdir(parents=True, exist_ok=True)
+        (aports_path / "community" / "B").mkdir(parents=True, exist_ok=True)
+
+        package_test_cases = [
+            ("A", None),
+            ("B", None),
+            ("C", None),
+            ("D", example_path),
+            ("E", example_path),
+        ]
+
+        for package_name, expected_value in package_test_cases:
+            with self.subTest(package_name=package_name):
+                returned_value = alpine.prepare_scan_dir(
+                    package_name=package_name,
+                    scan_target_path=example_path,
+                    aports_dir_path=self.data_location,
+                )
+                self.assertEqual(expected_value, returned_value)
+
+    def test_scanpipe_pipes_alpine_extract_summary_fields(self):
+        returned_value = alpine.extract_summary_fields(
+            self.data_location / "example_scan_summary.json",
+            ["copyrights", "holders", "authors"],
+        )
+        self.assertEqual(
+            returned_value,
+            {
+                "copyrights": ["Copyright (c) A B", "Copyright (c) C D"],
+                "holders": ["A B", "C D"],
+                "authors": ["A B", "C D"],
+            },
+        )
+
 
 class ScanPipePipesTransactionTest(TransactionTestCase):
     """