Skip to content

Commit 26d89a6

Browse files
committed
Add AlpinePackages pipeline
A pipeline that complements missing package data. It downloads the aports repository and all its necessary branches (alpine versions), then iterates over all alpine packages associated with the pipeline's project. For each package it copies additional files from the aports repository into the scan target directory, downloads and extracts all the source archives, performs a scan and saves its output to the package's database entry. Signed-off-by: Mateusz Perc <[email protected]>
1 parent e574fa9 commit 26d89a6

File tree

4 files changed

+181
-0
lines changed

4 files changed

+181
-0
lines changed

scanpipe/pipelines/alpine_packages.py

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes.alpine import (
25+
download_or_checkout_aports,
26+
get_packages_from_db,
27+
prepare_scan_dir,
28+
extract_summary_fields,
29+
)
30+
from scanpipe.pipes.scancode import run_extractcode, run_scancode
31+
32+
33+
class AlpinePackages(Pipeline):
    """
    A pipeline to complement missing alpine package data.

    Downloads the aports repository, then for each package of the project
    copies its aports files, fetches and extracts its sources, scans them and
    stores the extracted summary data in the package's extra data.
    """

    # Options passed to every `run_scancode` call made by this pipeline.
    scancode_options = ["--copyright", "--summary"]

    @classmethod
    def steps(cls):
        """Return the pipeline steps in the order they are executed."""
        return (
            cls.create_alpine_versions_dict,
            cls.download_aports_repo,
            cls.complement_missing_packages_data,
        )

    def create_alpine_versions_dict(self):
        """
        Create a dict, mapping alpine image ids from the database to alpine versions.
        """
        self.alpine_versions = {
            i["image_id"]: i["distro"]["version_id"]
            for i in self.project.extra_data["images"]
            if i["distro"]["identifier"] == "alpine"
        }

    def download_aports_repo(self):
        """
        Download the aports repository branch of every alpine version
        associated with this project.
        """
        # Several images may share one alpine version: fetch each branch once,
        # keeping the order in which the versions were first seen.
        for alpine_version in dict.fromkeys(self.alpine_versions.values()):
            download_or_checkout_aports(self.project.tmp_path, alpine_version)

    def complement_missing_packages_data(self):
        """
        Iterate over the packages yielded by `get_packages_from_db`.
        Checkout aports repository to the corresponding alpine version and commit.
        Prepare scan target directory, download and extract package's sources.
        Run scancode and extract missing data (only copyrights for now).
        Update and save package's missing data to database.

        A package is skipped when its image is not a known alpine image, when
        its `vcs_url` carries no `id=` commit reference, or when the checkout
        or the scan directory preparation reports a failure.
        """
        for scan_target_path, scan_result_path, package in get_packages_from_db(
            self.project
        ):
            # Packages created from a non-alpine image have no entry in the
            # versions dict; skip them instead of aborting the whole run.
            alpine_version = self.alpine_versions.get(
                (package.extra_data or {}).get("image_id")
            )
            if not alpine_version:
                continue

            # The commit to checkout is whatever follows "id=" in the vcs url.
            vcs_url_parts = (package.vcs_url or "").split("id=")
            if len(vcs_url_parts) < 2:
                continue
            commit_id = vcs_url_parts[1]

            if not download_or_checkout_aports(
                self.project.tmp_path, alpine_version, commit_id
            ):
                continue
            if not prepare_scan_dir(package.name, scan_target_path):
                continue

            run_extractcode(str(scan_target_path))
            run_scancode(
                str(scan_target_path), str(scan_result_path), self.scancode_options
            )
            package.update_extra_data(
                extract_summary_fields(scan_result_path, ["copyrights"])
            )

scanpipe/pipes/alpine.py

+84
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,91 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

23+
import json
24+
2325
from packagedcode import alpine
26+
from fetchcode.vcs.git import fetch_via_git
27+
from fetchcode import fetch
28+
from shutil import copytree
29+
from scanpipe.models import DiscoveredPackage
30+
31+
# Git repository that `download_or_checkout_aports` fetches from.
APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"

# Name of the directory, below the given base path, holding the aports checkout.
APORTS_DIR_NAME = "aports"

# Subdirectories of the checkout searched, in this order, for a package's directory.
APORTS_SUBDIRS = [
    "main",
    "non-free",
    "community",
    "testing",
    "unmaintained",
]
34+
35+
36+
def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
    """
    Fetch the aports repository into `aports_dir_path / APORTS_DIR_NAME`.

    The branch fetched is `<major>.<minor>-stable`, built from the first two
    dot-separated components of `alpine_version`.
    If `commit_id` is provided, that commit is fetched into the same directory
    afterwards.

    Always return True for now: reporting a failed checkout as False is still
    pending. #TODO Proper fetchcode patch required (extending #54)
    """
    version_parts = alpine_version.split(".")
    checkout_dir = str(aports_dir_path / APORTS_DIR_NAME)

    branch_url = f"git+{APORTS_URL}@{version_parts[0]}.{version_parts[1]}-stable"
    fetch_via_git(branch_url, checkout_dir)

    if commit_id:
        commit_url = f"git+{APORTS_URL}@{commit_id}"
        fetch_via_git(commit_url, checkout_dir)

    return True
49+
50+
51+
def get_packages_from_db(project):
    """
    Get alpine packages from the database that belong to a `project`.

    Yield `(scan_target_path, scan_result_path, package)` tuples, only for
    packages that satisfy all of the following:
    - the package type is "alpine",
    - the package is a root package (see the note in the body),
    - no scan result file exists yet at `scan_result_path`.

    `scan_target_path` is located under `project.tmp_path` and
    `scan_result_path` under `project.output_path`; both are named after the
    package name and version.
    """
    for package in DiscoveredPackage.objects.project(project):
        if package.type != "alpine":
            continue

        # A package counts as root when its first source package appears in its
        # own purl. NOTE(review): a package declaring no source package at all
        # is treated as root too, since it has no parent - confirm.
        source_packages = package.source_packages
        if source_packages and source_packages[0] not in package.purl:
            continue

        scan_id = f"{package.name}_{package.version}"
        scan_result_path = project.output_path / (scan_id + ".json")
        # An existing result file means this package has already been scanned.
        if scan_result_path.exists():
            continue

        yield project.tmp_path / scan_id, scan_result_path, package
66+
67+
68+
def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
    """
    Find package's aports path and if found execute the following steps:
    Copy all the files from that path into `scan_target_path`
    Download all package's sources into `scan_target_path`

    The default value of `aports_dir_path` is set to the parent of the
    `scan_target_path`.

    Return True once the first matching aports subdirectory has been
    processed, False when the package is in none of `APORTS_SUBDIRS`.
    """
    if aports_dir_path is None:
        aports_dir_path = scan_target_path.parent

    for subdir_name in APORTS_SUBDIRS:
        apkbuild_dir = aports_dir_path / APORTS_DIR_NAME / subdir_name / package_name
        if not apkbuild_dir.exists():
            continue

        copytree(apkbuild_dir, scan_target_path)

        package_data = alpine.parse_apkbuild(scan_target_path / "APKBUILD").to_dict()
        # Tolerate an APKBUILD without extra data or without declared sources.
        sources = (package_data.get("extra_data") or {}).get("sources") or []
        for source in sources:
            # Only entries that carry a URL are fetched.
            url = source.get("url")
            if url:
                fetch(url, scan_target_path)

        # Only the first subdirectory containing the package is used.
        return True

    return False
92+
93+
94+
def extract_summary_fields(scan_result_path, summary_fields):
    """
    Having a scancode result file, extract all the `summary_fields` values
    from its `summary` section.

    Return a dict mapping each name of `summary_fields` to the list of truthy
    `value` entries found for that field, in file order. A field missing from
    the summary, or a result file without a `summary` section, maps to an
    empty list.
    """
    with open(scan_result_path, encoding="utf-8") as scan_result:
        summary = json.load(scan_result).get("summary") or {}

    return {
        field: [
            element["value"]
            for element in summary.get(field) or []
            # Entries with an empty or missing value carry no information.
            if element.get("value")
        ]
        for field in summary_fields
    }
24108

25109

26110
def package_getter(root_dir, **kwargs):

scanpipe/pipes/docker.py

+1
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
123123
for i, (purl, package, layer) in enumerate(installed_packages):
124124
logger.info(f"Creating package #{i}: {purl}")
125125
created_package = pipes.update_or_create_package(project, package.to_dict())
126+
created_package.update_extra_data({"image_id": image.image_id})
126127

127128
# We have no files for this installed package, we cannot go further.
128129
if not package.installed_files:

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
6262
"scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
6363
"scan_package = scanpipe.pipelines.scan_package:ScanPackage",
64+
"alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages"
6465
],
6566
},
6667
classifiers=[

0 commit comments

Comments
 (0)