Skip to content

Commit be62961

Browse files
committed
Add AlpinePackages pipeline
A pipeline that complements missing package data. It downloads the aports repository and all its necessary branches (alpine versions), then iterates over all alpine packages associated with the pipeline's project. For each package it copies additional files from the aports repository into the scan target directory, then downloads and extracts all the source archives, performs a scan and saves its output to the package's database entry. Signed-off-by: Mateusz Perc <[email protected]>
1 parent e574fa9 commit be62961

File tree

4 files changed

+215
-0
lines changed

4 files changed

+215
-0
lines changed

scanpipe/pipelines/alpine_packages.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/nexB/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/nexB/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes.alpine import download_or_checkout_aports
25+
from scanpipe.pipes.alpine import extract_summary_fields
26+
from scanpipe.pipes.alpine import get_unscanned_packages_from_db
27+
from scanpipe.pipes.alpine import prepare_scan_dir
28+
from scanpipe.pipes.scancode import run_extractcode
29+
from scanpipe.pipes.scancode import run_scancode
30+
31+
32+
class AlpinePackages(Pipeline):
    """
    A pipeline to complement missing alpine package data.

    Downloads and extracts needed information from the aports repository and
    from package source files.
    Alpine Linux does not provide copyrights and (in some cases) licenses for
    its packages.
    """

    @classmethod
    def steps(cls):
        """Return the pipeline steps in the order they are listed."""
        return (
            cls.create_alpine_versions_dict,
            cls.download_aports_repo,
            cls.complement_missing_package_data,
        )

    # Options handed to every `run_scancode` call made by this pipeline.
    scancode_options = ["--copyright", "--summary"]

    def create_alpine_versions_dict(self):
        """
        Create a dict mapping alpine image ids from the database to alpine versions.

        Only images whose distro identifier is "alpine" are kept. The result is
        stored on the pipeline as `alpine_versions`.
        """
        self.alpine_versions = {
            image["image_id"]: image["distro"]["version_id"]
            for image in self.project.extra_data["images"]
            if image["distro"]["identifier"] == "alpine"
        }

    def download_aports_repo(self):
        """
        Set the pipeline's `aports_dir_path` variable to its project temporary path.
        Iterate over every alpine version associated with this project.
        Download corresponding aports repository branches (alpine versions).
        """
        self.aports_dir_path = self.project.tmp_path
        # Several images can share one alpine version: fetch each distinct
        # version only once. `dict.fromkeys` keeps first-seen order.
        for alpine_version in dict.fromkeys(self.alpine_versions.values()):
            download_or_checkout_aports(
                aports_dir_path=self.aports_dir_path, alpine_version=alpine_version
            )

    def complement_missing_package_data(self):
        """
        Iterate over alpine packages associated with this project.
        Checkout aports repository to the corresponding alpine version and a commit.
        Prepare scan target directory - download and extract package's sources.
        Run scancode and extract missing data (only copyrights for now).
        Update and save package's missing data to database.
        """
        unscanned_packages = get_unscanned_packages_from_db(
            project=self.project, alpine_versions=self.alpine_versions
        )
        for (
            alpine_version,
            commit_id,
            scan_target_path,
            scan_result_path,
            package,
        ) in unscanned_packages:
            # Skip the package when the checkout helper returns a falsy value.
            if not download_or_checkout_aports(
                aports_dir_path=self.aports_dir_path,
                alpine_version=alpine_version,
                commit_id=commit_id,
            ):
                continue
            # Skip the package when no scan directory could be prepared for it.
            if not prepare_scan_dir(
                package_name=package.name, scan_target_path=scan_target_path
            ):
                continue
            run_extractcode(location=str(scan_target_path))
            run_scancode(
                location=str(scan_target_path),
                output_file=str(scan_result_path),
                options=self.scancode_options,
            )
            package.update_extra_data(
                data=extract_summary_fields(
                    scan_result_path=scan_result_path,
                    summary_field_names=["copyrights"],
                )
            )

scanpipe/pipes/alpine.py

+106
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,114 @@
2020
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
2121
# Visit https://github.com/nexB/scancode.io for support and download.
2222

23+
24+
import json
25+
from shutil import copytree
26+
27+
from fetchcode import fetch
28+
from fetchcode.vcs.git import fetch_via_git
2329
from packagedcode import alpine
2430

31+
from scanpipe.models import DiscoveredPackage
32+
33+
# Git URL of the aports repository fetched by this module.
APORTS_URL = "https://gitlab.alpinelinux.org/alpine/aports.git"
34+
# Name of the directory, below a given base path, that holds the aports checkout.
APORTS_DIR_NAME = "aports"
35+
# Subdirectories of the aports checkout searched, in this order, for a package directory.
APORTS_SUBDIRS = ["main", "non-free", "testing", "community", "unmaintained"]
36+
37+
38+
def download_or_checkout_aports(aports_dir_path, alpine_version, commit_id=None):
    """
    Download the aports repository and its branch based on `alpine_version`.

    The branch name is built from the first two dot-separated components of
    `alpine_version` as `<major>.<minor>-stable`.
    If `commit_id` is provided, also checkout to that commit in the same location.

    `aports_dir_path` is the base directory in which the `aports` checkout is
    placed; it is joined with the `/` operator, so a path object is expected.

    Return the checkout location as a string. Return None, without fetching
    anything, when `alpine_version` is empty or None, so that callers can skip
    packages whose alpine version is unknown.

    TODO: Proper fetchcode patch required (extending #54). The results of the
    `fetch_via_git` calls are not inspected here, so a returned location does
    not by itself prove that the checkout(s) succeeded.
    """
    if not alpine_version:
        return None
    major, minor = alpine_version.split(".")[:2]
    checkout_location = str(aports_dir_path / APORTS_DIR_NAME)
    fetch_via_git(
        url=f"git+{APORTS_URL}@{major}.{minor}-stable", location=checkout_location
    )
    if commit_id:
        fetch_via_git(url=f"git+{APORTS_URL}@{commit_id}", location=checkout_location)
    return checkout_location
53+
54+
55+
def get_unscanned_packages_from_db(project, alpine_versions):
    """
    Yield one 5-tuple per alpine package of `project` that still needs a scan.

    Each tuple is
    (alpine_version, commit_id, scan_target_path, scan_result_path, package):
    - `alpine_version`: value looked up in the `alpine_versions` dict using the
      package's `image_id` extra data (None when the image id is not in the dict),
    - `commit_id`: the part of the package's `vcs_url` that follows "id=",
    - `scan_target_path`: directory under the project's temporary path on which
      the scan will be performed,
    - `scan_result_path`: json file under the project's output path that will
      hold the scan result,
    - `package`: the DiscoveredPackage instance itself.

    Subpackages (packages whose first source package is not part of their purl)
    and packages whose scan result file already exists are left out.
    """
    alpine_packages = DiscoveredPackage.objects.filter(project=project, type="alpine")
    for package in alpine_packages:
        scan_name = f"{package.name}_{package.version}"
        result_file = project.output_path / f"{scan_name}.json"
        version = alpine_versions.get(package.extra_data["image_id"])
        commit = package.vcs_url.split("id=")[1]
        target_dir = project.tmp_path / scan_name
        is_subpackage = bool(
            package.source_packages and package.source_packages[0] not in package.purl
        )
        already_scanned = result_file.exists()
        if is_subpackage or already_scanned:
            continue
        yield version, commit, target_dir, result_file, package
77+
78+
79+
def prepare_scan_dir(package_name, scan_target_path, aports_dir_path=None):
    """
    Gather all the package's source files in `scan_target_path`.

    Source files of an alpine package are obtained from its aports directory,
    whose location has to be guessed. Such a directory is present in one of the
    five aports repository subdirectories (main, non-free, testing, community,
    unmaintained). Its name is the same as the value of the corresponding
    package's `name` field (hence the `package_name` parameter).
    Here are some path examples:
        .../aports/main/acf-db
        .../aports/non-free/mongodb
    Inside, there are some extra files (patches) and an APKBUILD which contains
    urls to source tarballs.

    The function copies all these files (including APKBUILD) and downloads all
    the source tarballs to `scan_target_path`.
    The default value of `aports_dir_path` is the parent of `scan_target_path`.

    Return `scan_target_path` if the package's aports directory is found and is
    not empty. Return None otherwise.
    """
    if aports_dir_path is None:
        aports_dir_path = scan_target_path.parent
    for subdir_name in APORTS_SUBDIRS:
        apkbuild_dir = aports_dir_path / APORTS_DIR_NAME / subdir_name / package_name
        if not apkbuild_dir.exists():
            continue
        if not any(apkbuild_dir.iterdir()):
            # The package directory exists but is empty: stop searching.
            break
        copytree(apkbuild_dir, scan_target_path)
        apkbuild_package = alpine.parse_apkbuild(scan_target_path / "APKBUILD")
        # NOTE(review): assumes the parser may yield None for an APKBUILD it
        # cannot handle; treated as "no sources" - confirm against packagedcode.
        package_data = {} if apkbuild_package is None else apkbuild_package.to_dict()
        # `extra_data` and `sources` may be missing or None: fall back to empty.
        extra_data = package_data.get("extra_data") or {}
        for source in extra_data.get("sources") or []:
            source_url = source.get("url")
            if source_url:
                fetch(source_url, scan_target_path)
        return scan_target_path
    return None
114+
115+
116+
def extract_summary_fields(scan_result_path, summary_field_names):
    """
    Extract values from the `summary` section of a scancode result file.

    `scan_result_path` is the location of the scan result json file.
    `summary_field_names` is an iterable of field names to read from the
    `summary` section.

    Return a dict mapping each requested field name to the list of its
    non-empty `value` entries. A field that is absent from the summary maps to
    an empty list.
    Raise KeyError if the scan result has no `summary` section.
    """
    # The context manager closes the file even if json parsing fails.
    with open(scan_result_path, encoding="utf-8") as scan_result:
        summaries = json.load(scan_result)["summary"]
    return {
        field_name: [
            entry["value"] for entry in summaries.get(field_name, []) if entry["value"]
        ]
        for field_name in summary_field_names
    }
130+
25131

26132
def package_getter(root_dir, **kwargs):
27133
"""

scanpipe/pipes/docker.py

+1
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ def scan_image_for_system_packages(project, image, detect_licenses=True):
122122

123123
for i, (purl, package, layer) in enumerate(installed_packages):
124124
logger.info(f"Creating package #{i}: {purl}")
125+
package.extra_data = {"image_id": image.image_id}
125126
created_package = pipes.update_or_create_package(project, package.to_dict())
126127

127128
# We have no files for this installed package, we cannot go further.

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
"root_filesystems = scanpipe.pipelines.root_filesystems:RootFS",
6262
"scan_codebase = scanpipe.pipelines.scan_codebase:ScanCodebase",
6363
"scan_package = scanpipe.pipelines.scan_package:ScanPackage",
64+
"alpine_packages = scanpipe.pipelines.alpine_packages:AlpinePackages"
6465
],
6566
},
6667
classifiers=[

0 commit comments

Comments
 (0)