diff --git a/scanpipe/pipelines/alpine_packages.py b/scanpipe/pipelines/alpine_packages.py
new file mode 100644
index 000000000..bbfd6b6a1
--- /dev/null
+++ b/scanpipe/pipelines/alpine_packages.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# http://nexb.com and https://github.com/nexB/scancode.io
+# The ScanCode.io software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode.io is provided as-is without warranties.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode.io should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+#
+# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode.io for support and download.
+
+from scanpipe.pipelines import Pipeline
+from scanpipe.pipes.alpine import download_or_checkout_aports
+from scanpipe.pipes.alpine import extract_summary_fields
+from scanpipe.pipes.alpine import get_unscanned_packages_from_db
+from scanpipe.pipes.alpine import prepare_scan_dir
+from scanpipe.pipes.scancode import run_extractcode
+from scanpipe.pipes.scancode import run_scancode
+
+
+class AlpinePackages(Pipeline):
+    """
+    A pipeline to complement missing alpine package data.
+    Downloads and extracts needed information from aports repository and package
+    source files.
+    Alpine Linux does not provide copyrights and (in some cases) licenses for its
+    packages.
+    """
+
+    @classmethod
+    def steps(cls):
+        return (
+            cls.create_alpine_versions_dict,
+            cls.download_aports_repo,
+            cls.complement_missing_package_data,
+        )
+
+    scancode_options = ["--copyright", "--summary"]
+
+    def create_alpine_versions_dict(self):
+        """
+        Create a dict mapping alpine image ids from the database to alpine versions.
+        """
+        self.alpine_versions = {
+            image["image_id"]: image["distro"]["version_id"]
+            for image in self.project.extra_data.get("images", [])
+            if image["distro"]["identifier"] == "alpine"
+        }
+
+    def download_aports_repo(self):
+        """
+        Set pipeline's `aports_dir_path` variable to its project temporary path.
+        Download the aports repository branch of every distinct alpine version
+        associated with this project.
+        """
+        self.aports_dir_path = self.project.tmp_path
+        # Several images may share one alpine version: fetch each branch only once.
+        for alpine_version in set(self.alpine_versions.values()):
+            download_or_checkout_aports(
+                aports_dir_path=self.aports_dir_path, alpine_version=alpine_version
+            )
+
+    def complement_missing_package_data(self):
+        """
+        Iterate over alpine packages associated with this project.
+        Checkout aports repository to the corresponding alpine version and a commit.
+        Prepare scan target directory - download and extract package's sources.
+        Run scancode and extract missing data (only copyrights for now).
+        Update and save package's missing data to database.
+        """
+        unscanned_packages = get_unscanned_packages_from_db(
+            project=self.project, alpine_versions=self.alpine_versions
+        )
+        for (
+            alpine_version,
+            commit_id,
+            scan_target_path,
+            scan_result_path,
+            package,
+        ) in unscanned_packages:
+            if not download_or_checkout_aports(
+                aports_dir_path=self.aports_dir_path,
+                alpine_version=alpine_version,
+                commit_id=commit_id,
+            ):
+                continue
+            if not prepare_scan_dir(
+                package_name=package.name,
+                scan_target_path=scan_target_path,
+                aports_dir_path=self.aports_dir_path,
+            ):
+                continue
+
+            run_extractcode(location=str(scan_target_path))
+            run_scancode(
+                location=str(scan_target_path),
+                output_file=str(scan_result_path),
+                options=self.scancode_options,
+            )
+            package.update_extra_data(
+                data=extract_summary_fields(
+                    scan_result_path=scan_result_path,
+                    summary_field_names=["copyrights"],
+                )
+            )
diff --git a/scanpipe/pipes/alpine.py b/scanpipe/pipes/alpine.py
index 5340b27d7..3d22dd98c 100644
--- a/scanpipe/pipes/alpine.py
+++ b/scanpipe/pipes/alpine.py
@@ -20,8 +20,138 @@
 # ScanCode.io is a free software code scanning tool from nexB Inc. and others.
 # Visit https://github.com/nexB/scancode.io for support and download.
 
+
+import json
+from shutil import copytree
+
+from fetchcode import fetch
+from fetchcode.vcs.git import fetch_via_git
 from packagedcode import alpine
 
+from scanpipe.models import DiscoveredPackage
+
+APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
+APORTS_DIR_NAME = "aports"
+APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"]
+
+
+def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
+    """
+    Download aports repository and its branch based on `alpine_version`.
+    Checkout to a branch (alpine version).
+    If `commit_id` is provided also checkout to a commit.
+    Return the path of the aports checkout directory as a string.
+    """
+    # TODO: Proper fetchcode patch required (extending #54)
+    major, minor = alpine_version.split(".")[:2]
+    aports_dir_path = str(aports_dir_path / APORTS_DIR_NAME)
+    fetch_via_git(
+        url=f"git+{APORTS_URL}@{major}.{minor}-stable", location=aports_dir_path
+    )
+    if commit_id:
+        fetch_via_git(url=f"git+{APORTS_URL}@{commit_id}", location=aports_dir_path)
+    return aports_dir_path
+
+
+def get_unscanned_packages_from_db(project, alpine_versions):
+    """
+    Return an iterator of 5-tuples
+    (alpine_version, commit_id, scan_target_path, scan_result_path, package) where:
+    `alpine_version` is the alpine version a package comes from (obtained from the
+    `alpine_versions` dict),
+    `commit_id` is the id of the aports repository commit that added the
+    corresponding version of a package,
+    `scan_target_path` is the path of the directory on which a scan will be
+    performed,
+    `scan_result_path` is the path of the scan result json file,
+    `package` is a DiscoveredPackage instance that belongs to `project` and has the
+    alpine package type.
+    The returned iterator contains not-a-subpackage alpine packages that don't have
+    an existing scan result file.
+    Packages without a known alpine version or without a commit id in their
+    `vcs_url` cannot be scanned and are skipped.
+    """
+    for package in DiscoveredPackage.objects.filter(project=project, type="alpine"):
+        source_packages = package.source_packages
+        if source_packages and source_packages[0] not in package.purl:
+            # Subpackages are not scanned on their own.
+            continue
+
+        scan_id = f"{package.name}_{package.version}"
+        scan_result_path = project.output_path / f"{scan_id}.json"
+        if scan_result_path.exists():
+            continue
+
+        image_id = (package.extra_data or {}).get("image_id")
+        alpine_version = alpine_versions.get(image_id)
+        # The commit id is the part of `vcs_url` that follows "id=".
+        vcs_url_parts = (package.vcs_url or "").split("id=")
+        commit_id = vcs_url_parts[1] if len(vcs_url_parts) > 1 else None
+        if not alpine_version or not commit_id:
+            continue
+
+        scan_target_path = project.tmp_path / scan_id
+        yield alpine_version, commit_id, scan_target_path, scan_result_path, package
+
+
+def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
+    """
+    Gather all the package's source files in `scan_target_path`.
+    Source files of an alpine package are obtained from its aports directory whose
+    location has to be guessed.
+    Such directory is present in one of the five aports repository subdirectories
+    (main, non-free, testing, community, unmaintained).
+    Its name is the same as the value of the corresponding package's `name` field
+    (hence the `package_name` parameter).
+    Here are some path examples:
+        .../aports/main/acf-db
+        .../aports/non-free/mongodb
+    Inside, there are some extra files (patches) and an APKBUILD which contains urls
+    to source tarballs.
+    The function copies all these files (including APKBUILD) and downloads all the
+    source tarballs to `scan_target_path`.
+    The default value of `aports_dir_path` is set to the parent of the
+    `scan_target_path`.
+    Return `scan_target_path` if the package's aports directory is found/guessed and
+    is not empty, None otherwise.
+    """
+    if aports_dir_path is None:
+        aports_dir_path = scan_target_path.parent
+    for subdir_name in APORTS_SUBDIRS:
+        apkbuild_dir = aports_dir_path / APORTS_DIR_NAME / subdir_name / package_name
+        if not apkbuild_dir.exists():
+            continue
+        if not any(apkbuild_dir.iterdir()):
+            # The package directory exists but is empty: there is nothing to scan.
+            break
+        copytree(apkbuild_dir, scan_target_path)
+        apkbuild = alpine.parse_apkbuild(scan_target_path / "APKBUILD")
+        # Tolerate a missing parse result as well as missing `extra_data`/`sources`.
+        apkbuild_data = apkbuild.to_dict() if apkbuild else {}
+        extra_data = apkbuild_data.get("extra_data") or {}
+        for source in extra_data.get("sources") or []:
+            source_url = source.get("url")
+            if source_url:
+                fetch(source_url, scan_target_path)
+        return scan_target_path
+    return None
+
+
+def extract_summary_fields(scan_result_path, summary_field_names):
+    """
+    Extract the values of the `summary_field_names` fields from the `summary`
+    section of a scancode result file (`scan_result_path`).
+    Return a `result` dict mapping each field name to the list of its non-empty
+    values (result[`field_name`]).
+    """
+    with open(scan_result_path, encoding="utf-8") as scan_result:
+        summaries = json.load(scan_result).get("summary") or {}
+    result = {}
+    for field_name in summary_field_names:
+        values = (summary.get("value") for summary in summaries.get(field_name, []))
+        result[field_name] = [value for value in values if value]
+    return result
+
 
 def package_getter(root_dir, **kwargs):
     """
diff --git a/scanpipe/pipes/docker.py b/scanpipe/pipes/docker.py
index b82d5f800..8d36614cc 100644
--- a/scanpipe/pipes/docker.py
+++ b/scanpipe/pipes/docker.py
@@ -122,6 +122,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
 
     for i, (purl, package, layer) in enumerate(installed_packages):
         logger.info(f"Creating package #{i}: {purl}")
+        package.extra_data = {"image_id": image.image_id}
         created_package = pipes.update_or_create_package(project, package.to_dict())
 
         # We have no files for this installed package, we cannot go further.
diff --git a/scanpipe/tests/data/aports/community/A/APKBUILD b/scanpipe/tests/data/aports/community/A/APKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/aports/community/D/APKBUILD b/scanpipe/tests/data/aports/community/D/APKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/aports/community/E/NOTAPKBUILD b/scanpipe/tests/data/aports/community/E/NOTAPKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/aports/example/C/APKBUILD b/scanpipe/tests/data/aports/example/C/APKBUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/scanpipe/tests/data/example_scan_summary.json b/scanpipe/tests/data/example_scan_summary.json
new file mode 100644
index 000000000..4a4f902b9
--- /dev/null
+++ b/scanpipe/tests/data/example_scan_summary.json
@@ -0,0 +1,34 @@
+{
+  "summary": {
+    "copyrights": [
+      {
+        "value": "Copyright (c) A B",
+        "count": 51
+      },
+      {
+        "value": "Copyright (c) C D",
+        "count": 8
+      }
+    ],
+    "holders": [
+      {
+        "value": "A B",
+        "count": 51
+      },
+      {
+        "value": "C D",
+        "count": 41
+      }
+    ],
+    "authors": [
+      {
+        "value": "A B",
+        "count": 2
+      },
+      {
+        "value": "C D",
+        "count": 1
+      }
+    ]
+  }
+}
diff --git a/scanpipe/tests/test_pipes.py b/scanpipe/tests/test_pipes.py
index 0e5dfb89d..32ea59a1f 100644
--- a/scanpipe/tests/test_pipes.py
+++ b/scanpipe/tests/test_pipes.py
@@ -38,6 +38,7 @@
 from scanpipe.models import CodebaseResource
 from scanpipe.models import DiscoveredPackage
 from scanpipe.models import Project
+from scanpipe.pipes import alpine
 from scanpipe.pipes import codebase
 from scanpipe.pipes import docker
 from scanpipe.pipes import fetch
@@ -756,6 +757,106 @@ def test_scanpipe_pipes_rootfs_has_hash_diff(self):
         codebase_resource = CodebaseResource(sha256="sha256", md5="md5")
         self.assertFalse(rootfs.has_hash_diff(install_file, codebase_resource))
 
+    @mock.patch("scanpipe.pipes.alpine.fetch_via_git")
+    def test_scanpipe_pipes_alpine_download_or_checkout_aports(self, fetch_via_git):
+        example_path = Path()
+        aports_path = str(example_path / alpine.APORTS_DIR_NAME)
+
+        alpine.download_or_checkout_aports(
+            aports_dir_path=example_path, alpine_version="3.13.14"
+        )
+        fetch_via_git.assert_called_with(
+            url=f"git+{alpine.APORTS_URL}@3.13-stable", location=aports_path
+        )
+
+        alpine.download_or_checkout_aports(
+            aports_dir_path=example_path, alpine_version="3.13.14", commit_id="1"
+        )
+        fetch_via_git.assert_called_with(
+            url=f"git+{alpine.APORTS_URL}@1", location=aports_path
+        )
+
+    def test_scanpipe_pipes_alpine_get_unscanned_packages_from_db(self):
+        project = Project.objects.create(name="example")
+        alpine_versions = {"1": "3.12", "2": "3.13"}
+        package_field_names = (
+            "type",
+            "name",
+            "version",
+            "vcs_url",
+            "source_packages",
+            "extra_data",
+        )
+        package_data = [
+            ("debian",),
+            ("rpm",),
+            ("alpine", "A", "1.0", "id=A", [], {"image_id": "1"}),
+            ("alpine", "B", "1.0", "id=B", [], {"image_id": "2"}),
+        ]
+        # The test will get bigger (thus arrays and loops instead of consecutive
+        # function calls) - further patches for this function expected
+        expected_package_tuples = [
+            (
+                "3.13",
+                "B",
+                project.tmp_path / "B_1.0",
+                project.output_path / "B_1.0.json",
+            ),
+        ]
+        (project.output_path / "A_1.0.json").touch()
+        for package_data_tuple in package_data:
+            DiscoveredPackage.objects.create(
+                project=project, **dict(zip(package_field_names, package_data_tuple))
+            )
+        yielded_package_tuples = alpine.get_unscanned_packages_from_db(
+            project=project, alpine_versions=alpine_versions
+        )
+        # Compare whole lists so that missing or extra packages fail the test.
+        self.assertEqual(
+            expected_package_tuples,
+            [package_tuple[:4] for package_tuple in yielded_package_tuples],
+        )
+
+    @mock.patch("scanpipe.pipes.alpine.alpine.parse_apkbuild")
+    @mock.patch("scanpipe.pipes.alpine.copytree")
+    def test_scanpipe_pipes_alpine_prepare_scan_dir(self, copytree, parse_apkbuild):
+        example_path = Path()
+
+        aports_path = self.data_location / alpine.APORTS_DIR_NAME
+        (aports_path / "main" / "A").mkdir(parents=True, exist_ok=True)
+        (aports_path / "non-free" / "A").mkdir(parents=True, exist_ok=True)
+        (aports_path / "community" / "B").mkdir(parents=True, exist_ok=True)
+
+        package_test_cases = [
+            ("A", None),
+            ("B", None),
+            ("C", None),
+            ("D", example_path),
+            ("E", example_path),
+        ]
+
+        for test_case in package_test_cases:
+            returned_value = alpine.prepare_scan_dir(
+                package_name=test_case[0],
+                scan_target_path=example_path,
+                aports_dir_path=self.data_location,
+            )
+            self.assertEqual(returned_value, test_case[1])
+
+    def test_scanpipe_pipes_alpine_extract_summary_fields(self):
+        returned_value = alpine.extract_summary_fields(
+            self.data_location / "example_scan_summary.json",
+            ["copyrights", "holders", "authors"],
+        )
+        self.assertEqual(
+            returned_value,
+            {
+                "copyrights": ["Copyright (c) A B", "Copyright (c) C D"],
+                "holders": ["A B", "C D"],
+                "authors": ["A B", "C D"],
+            },
+        )
+
 
 class ScanPipePipesTransactionTest(TransactionTestCase):
     """
diff --git a/setup.py b/setup.py
index 6acaec642..b1666e5ec 100755
--- a/setup.py
+++ b/setup.py
@@ -61,6 +61,7 @@
             "root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
             "scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
             "scan_package = scanpipe.pipelines.scan_package:ScanPackage",
+            "alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages",
         ],
     },
     classifiers=[