Skip to content

Commit 9e54fcb

Browse files
Merge branch 'main' into minecode-pipeline-npm
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
2 parents 65ae71f + 9c875b5 commit 9e54fcb

33 files changed

+2263
-186
lines changed

minecode_pipelines/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@
88
#
99

1010

11-
VERSION = "0.0.1b17"
11+
VERSION = "0.0.1b23"
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import json
11+
from minecode_pipelines.utils import get_temp_file
12+
import requests
13+
from packageurl import PackageURL
14+
15+
16+
def get_composer_packages():
    """
    Fetch the list of all Composer package names from Packagist and save the
    decoded JSON payload to a temporary JSON file.

    Return the temporary file location obtained from ``get_temp_file``, or
    None when Packagist does not answer with a successful HTTP status.

    Response example:
    {
        "packageNames": ["0.0.0/composer-include-files", "0.0.0/laravel-env-shim"]
    }
    """
    # requests never times out on its own: without an explicit timeout a
    # stalled connection would block the caller forever.
    response = requests.get("https://packagist.org/packages/list.json", timeout=60)
    if not response.ok:
        return None

    packages = response.json()
    temp_file = get_temp_file("ComposerPackages", "json")
    with open(temp_file, "w", encoding="utf-8") as f:
        json.dump(packages, f, indent=4)

    return temp_file
35+
36+
37+
def get_composer_purl(vendor, package):
    """
    Return a list of Package URL (purl) strings, one for each release that
    Packagist reports for the Composer package ``vendor/package``.

    Return an empty list when the HTTP request fails, when the response body
    is not valid JSON, or when the payload does not have the expected shape.

    The "packages" entry of the response is read as a mapping of the full
    package name to a list of release mappings, for example:
    {
        "minified": "composer/2.0",
        "packages": {
            "monolog/monolog": [
                {
                    "name": "monolog/monolog",
                    "version": "3.9.0"
                }
            ]
        },
        "security-advisories": [
            {
                "advisoryId": "PKSA-dmw8-jd8k-q3c6",
                "affectedVersions": ">=1.8.0,<1.12.0"
            }
        ]
    }

    get_composer_purl("monolog", "monolog")
    -> ["pkg:composer/monolog/monolog@3.9.0", ...]
    """
    purls = []
    url = f"https://repo.packagist.org/p2/{vendor}/{package}.json"

    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Decoding belongs in the guarded section: a non-JSON body raises a
        # ValueError subclass and should yield "no purls", not a crash.
        data = response.json()
    except (requests.RequestException, ValueError):
        return purls

    packages = data.get("packages", {}) if isinstance(data, dict) else {}
    if not isinstance(packages, dict):
        return purls

    releases = packages.get(f"{vendor}/{package}", [])

    for release in releases:
        # Skip entries that are not mappings rather than failing on `.get`.
        if not isinstance(release, dict):
            continue
        version = release.get("version")
        if version:
            purl = PackageURL(
                type="composer",
                namespace=vendor,
                name=package,
                version=version,
            )
            purls.append(purl.to_string())

    return purls
88+
89+
90+
def load_composer_packages(packages_file):
    """
    Read the JSON file at ``packages_file`` and return a list of
    (vendor, package) tuples built from its "packageNames" entries.

    Names that do not contain a "/" separator are skipped; names are split on
    the first "/" only.
    """
    with open(packages_file, encoding="utf-8") as stream:
        names = json.load(stream).get("packageNames", [])

    return [tuple(full_name.split("/", 1)) for full_name in names if "/" in full_name]

minecode_pipelines/miners/cran.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
import json
from collections.abc import Iterator
from pathlib import Path

import requests
from packageurl import PackageURL
13+
14+
15+
def fetch_cran_db(output_file="cran_db.json", timeout=120) -> Path:
    """
    Download the CRAN package database (~250MB JSON) to ``output_file`` and
    return that location as a Path.

    The response is streamed to disk in chunks instead of being loaded into
    memory. ``timeout`` is passed to ``requests.get`` so a stalled connection
    cannot block the caller forever.

    Raise ``requests.HTTPError`` when the server answers with an error status.
    """
    url = "https://crandb.r-pkg.org/-/all"
    output_path = Path(output_file)

    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with output_path.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

    return output_path
31+
32+
33+
def extract_cran_packages(json_file_path: str) -> Iterator[list[str]]:
    """
    Yield, for each package of a CRAN DB JSON file, the list of purl strings
    of all the versions of that package.

    This is a generator: the file is only checked and loaded once iteration
    starts, so ``FileNotFoundError`` is raised on the first ``next()`` call and
    not when the function is called.

    Input example:
    {
        "AATtools": {
            "_id": "AATtools",
            "_rev": "8-9ebb721d05b946f2b437b49e892c9e8c",
            "name": "AATtools",
            "versions": {
                "0.0.1": {...},
                "0.0.2": {...},
                "0.0.3": {...}
            }
        }
    }
    """
    db_path = Path(json_file_path)
    if not db_path.exists():
        raise FileNotFoundError(f"File not found: {db_path}")

    with db_path.open(encoding="utf-8") as f:
        data = json.load(f)

    for pkg_name, pkg_data in data.items():
        # The keys of the "versions" mapping are the version strings.
        yield [
            PackageURL(type="cran", name=pkg_name, version=version).to_string()
            for version in pkg_data.get("versions", {})
        ]

minecode_pipelines/miners/swift.py

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
import shutil
10+
import subprocess
11+
from urllib.parse import urlparse
12+
13+
"""
14+
Clone the Swift Index repo (https://github.com/SwiftPackageIndex/PackageList) and the Minecode Pipelines Swift repo.
15+
Read the packages.json file from the Swift Index repo to get a list of Git repositories.
16+
Fetch the tags for each repo using the git ls-remote command,
17+
then create package URLs for each repo with its version and store them in the Minecode Pipelines Swift repo.
18+
"""
19+
20+
21+
def is_safe_repo_url(repo_url: str) -> bool:
    """
    Return True if ``repo_url`` is an HTTPS github.com URL whose path either
    ends with ".git" or contains at least two "/" separators (as in
    "/org/repo").

    The scheme and host checks always apply: a URL with another scheme or host
    is rejected whatever its path looks like.
    """
    parsed = urlparse(repo_url)
    if parsed.scheme != "https" or parsed.netloc != "github.com":
        return False

    path = parsed.path
    return path.endswith(".git") or path.count("/") >= 2
30+
31+
32+
def fetch_git_tags_raw(repo_url: str, timeout: int = 60, logger=None) -> str | None:
    """
    Run `git ls-remote` on ``repo_url`` and return its stripped standard
    output, or None on error.

    None is returned when no git executable is found in PATH, when git exits
    with a non-zero status, or when the command takes longer than ``timeout``
    seconds. ``logger`` is an optional callable taking a message string; when
    it is None, messages are dropped.

    Raise ValueError when ``repo_url`` is rejected by ``is_safe_repo_url``.
    """

    def log(message):
        # ``logger`` is optional: calling None would raise a TypeError.
        if logger is not None:
            logger(message)

    git_executable = shutil.which("git")
    if git_executable is None:
        log("Git executable not found in PATH")
        return None

    if not is_safe_repo_url(repo_url):
        raise ValueError(f"Unsafe repo URL: {repo_url}")

    try:
        result = subprocess.run(
            # "--" ends option parsing so the URL is never read as a git option.
            [git_executable, "ls-remote", "--", repo_url],
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout,
        )
        return result.stdout.strip()
    except subprocess.CalledProcessError as e:
        log(f"Failed to fetch tags for {repo_url}: {e}")
    except subprocess.TimeoutExpired:
        log(f"Timeout fetching tags for {repo_url}")
    return None
56+
57+
58+
# FIXME duplicated with miners github
59+
def split_org_repo(url_like):
    """
    Return an (org, name) tuple for a GitHub repo given either as a URL-like
    string or as a short "org/name" string.

    The last two non-empty "/"-separated segments are used, and a trailing
    ".git" is removed from the name. Raise ValueError when fewer than two
    segments are found.

    For example:
    >>> split_org_repo('foo/bar')
    ('foo', 'bar')
    >>> split_org_repo('https://api.github.com/repos/foo/bar/')
    ('foo', 'bar')
    >>> split_org_repo('github.com/foo/bar/')
    ('foo', 'bar')
    >>> split_org_repo('git://github.com/foo/bar.git')
    ('foo', 'bar')
    """
    parts = []
    for raw_piece in url_like.split("/"):
        piece = raw_piece.strip()
        if piece:
            parts.append(piece)

    if len(parts) < 2:
        raise ValueError(f"Not a GitHub-like URL: {url_like}")

    *_, org, name = parts
    return org, name.removesuffix(".git")
82+
83+
84+
# FIXME duplicated with purl2vcs.find_source_repo.get_tags_and_commits_from_git_output
85+
def get_tags_and_commits_from_git_output(git_ls_remote):
    """
    Yield tuples of (tag, commit), given a git ls-remote output.

    Each line is expected to hold a commit id and a ref name separated by a
    tab. Only refs starting with "refs/tags/" are kept, and only that leading
    prefix is removed from the ref to build the tag name.
    """
    tag_prefix = "refs/tags/"
    for line in git_ls_remote.split("\n"):
        # line: kjwfgeklngelkfjofjeo123	refs/tags/1.2.3
        line_segments = line.split("\t")
        # segments: ["kjwfgeklngelkfjofjeo123", "refs/tags/1.2.3"]
        if len(line_segments) < 2:
            continue

        commit, ref = line_segments[0], line_segments[1]
        if ref.startswith(tag_prefix):
            # Strip the leading prefix only: a tag name may itself contain
            # "refs/tags/" and must be kept intact.
            yield ref.removeprefix(tag_prefix), commit
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
import os
24+
from scanpipe.pipelines import Pipeline
25+
from scanpipe.pipes import federatedcode
26+
27+
from minecode_pipelines import pipes
28+
from minecode_pipelines.pipes import MINECODE_PIPELINES_CONFIG_REPO
29+
from minecode_pipelines.pipes.composer import mine_composer_packages
30+
from minecode_pipelines.pipes.composer import mine_and_publish_composer_purls
31+
32+
# Git URL of the repository cloned as the Composer data repo by the pipeline
# below; taken from the MINECODE_COMPOSER_GIT_URL environment variable when it
# is set.
MINECODE_COMPOSER_GIT_URL = os.environ.get(
    "MINECODE_COMPOSER_GIT_URL",
    "https://github.com/aboutcode-data/minecode-data-composer-test",
)
35+
36+
37+
class MineComposer(Pipeline):
    """
    Mine all packageURLs from a composer index and publish them to a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        return (
            cls.check_federatedcode_eligibility,
            cls.clone_composer_repo,
            cls.mine_and_publish_composer_purls,
            # Without this entry the cleanup method below is never referenced
            # by the pipeline.
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """
        Check if the project fulfills the following criteria for
        pushing the project result to FederatedCode.
        """
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def clone_composer_repo(self):
        """
        Clone the Composer data repository and the pipelines config repository
        and keep both results on the pipeline instance for the later steps.
        """
        self.cloned_data_repo = federatedcode.clone_repository(MINECODE_COMPOSER_GIT_URL)
        self.cloned_config_repo = federatedcode.clone_repository(MINECODE_PIPELINES_CONFIG_REPO)

    def mine_and_publish_composer_purls(self):
        """
        Mine Composer package names from Composer indexes and generate
        package URLs (pURLs) for all mined Composer packages.
        """
        composer_packages = mine_composer_packages()
        mine_and_publish_composer_purls(
            packages=composer_packages,
            cloned_data_repo=self.cloned_data_repo,
            cloned_config_repo=self.cloned_config_repo,
            logger=self.log,
        )

    def delete_cloned_repos(self):
        """
        Pass the two repositories cloned by ``clone_composer_repo`` to
        ``pipes.delete_cloned_repos``.
        """
        pipes.delete_cloned_repos(
            repos=[self.cloned_data_repo, self.cloned_config_repo],
            logger=self.log,
        )

0 commit comments

Comments
 (0)