Skip to content

Commit c210044

Browse files
Add support to mine npm PackageURLs
Reference: #661 Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
1 parent c468a76 commit c210044

File tree

7 files changed

+637
-43
lines changed

7 files changed

+637
-43
lines changed

minecode_pipelines/miners/npm.py

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# purldb is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/purldb for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
11+
import json
12+
import requests
13+
14+
from packageurl import PackageURL
15+
16+
from minecode_pipelines.utils import get_temp_file
17+
from minecode_pipelines.pipes import write_data_to_json_file
18+
19+
"""
20+
Visitors for Npmjs and npmjs-like javascript package repositories.
21+
22+
We have this hierarchy in npm replicate and registry index:
23+
npm projects replicate.npmjs.com (paginated JSON) -> versions at registry.npmjs.org (JSON) -> download urls
24+
25+
See https://github.com/orgs/community/discussions/152515 for information on
26+
the latest replicate.npmjs.com API.
27+
28+
https://replicate.npmjs.com/_all_docs
29+
This NPMJS replicate API serves as an index to get all npm packages and their revision IDs
30+
in paginated queries.
31+
32+
https://replicate.npmjs.com/_changes
33+
This NPMJS replicate API serves as a CHANGELOG of npm packages with update sequences which
34+
can be fetched in paginated queries.
35+
36+
https://registry.npmjs.org/{namespace/name}
37+
For each npm package, a JSON containing details including the list of all releases
38+
and archives, their URLs, and some metadata for each release.
39+
40+
https://registry.npmjs.org/{namespace/name}/{version}
41+
For each release, a JSON contains details for the released version and all the
42+
downloads available for this release.
43+
"""
44+
45+
46+
47+
# Base URL of the npm replicate API, used to list package names and changes.
NPM_REPLICATE_REPO = "https://replicate.npmjs.com/"
# Base URL of the npm registry API, used to fetch the metadata of one package.
NPM_REGISTRY_REPO = "https://registry.npmjs.org/"
# Package type passed to PackageURL when building npm PackageURLs.
NPM_TYPE = "NPM"
# Number of entries requested per page ("limit" parameter) from the replicate API.
NPM_REPLICATE_BATCH_SIZE = 10000
51+
52+
53+
def get_package_names_last_key(package_data):
    """
    Return a two-tuple of (names, last_key) extracted from a ``package_data``
    mapping holding a "rows" list: ``names`` is the list of the "id" value of
    each row and ``last_key`` is the "key" value of the final row.
    """
    rows = package_data.get("rows")

    names = []
    for row in rows:
        names.append(row.get("id"))

    # The key of the final row is what callers use to request the next page.
    final_row = rows[-1]
    return names, final_row.get("key")
61+
62+
63+
64+
def get_package_names_last_seq(package_data):
    """
    Return a two-tuple of (names, last_seq) extracted from a ``package_data``
    mapping holding a "results" list: ``names`` is the list of the "id" value
    of each result and ``last_seq`` is the top-level "last_seq" value.
    """
    names = []
    for change in package_data.get("results"):
        names.append(change.get("id"))

    return names, package_data.get("last_seq")
72+
73+
74+
def get_current_last_seq(replicate_url=NPM_REPLICATE_REPO):
    """
    Return the "last_seq" value reported by the ``_changes`` endpoint of the
    npm replicate API at ``replicate_url``, or None when the request is not
    successful.
    """
    # NOTE(review): the query value is spelled "True"; confirm that the
    # replicate API accepts this spelling for the "descending" parameter.
    latest_changes_url = replicate_url + "_changes?descending=True"
    response = requests.get(latest_changes_url)

    if response.ok:
        # Only the sequence is needed here, the package names are discarded.
        _names, current_seq = get_package_names_last_seq(response.json())
        return current_seq

    return None
84+
85+
86+
def get_updated_npm_packages(last_seq, replicate_url=NPM_REPLICATE_REPO):
    """
    Return a two-tuple of ({"packages": names}, last_seq) for the packages
    changed after the ``last_seq`` sequence, as listed by the ``_changes``
    endpoint of the npm replicate API at ``replicate_url``.

    Changes are fetched in pages of NPM_REPLICATE_BATCH_SIZE entries. The
    returned ``last_seq`` is the sequence reported by the last page that was
    fetched successfully (or the ``last_seq`` argument if no page was).

    If a request fails, paging stops and the names collected so far are
    returned with that same two-tuple shape, so that callers can always
    unpack the result in the same way.
    """
    all_package_names = []
    i = 0

    while True:
        print(f"Processing iteration: {i}: changes after seq: {last_seq}")
        npm_replicate_changes = (
            replicate_url
            + "_changes?"
            + f"limit={NPM_REPLICATE_BATCH_SIZE}"
            + f"&since={last_seq}"
        )
        response = requests.get(npm_replicate_changes)
        if not response.ok:
            # Best effort: keep what was collected so far. ``last_seq`` still
            # points at the last page processed, so mining can resume there.
            break

        package_data = response.json()
        package_names, last_seq = get_package_names_last_seq(package_data)
        all_package_names.extend(package_names)

        # A page shorter than the batch size is the last set of changes.
        if len(package_names) < NPM_REPLICATE_BATCH_SIZE:
            break

        i += 1

    return {"packages": all_package_names}, last_seq
110+
111+
112+
def get_npm_packages(replicate_url=NPM_REPLICATE_REPO):
    """
    Return a {"packages": names} mapping where ``names`` is a flat list of
    the package names listed by the ``_all_docs`` endpoint of the npm
    replicate API at ``replicate_url``.

    Names are fetched in pages of NPM_REPLICATE_BATCH_SIZE entries; each
    follow-up page is requested with a ``start_key`` set to the last key of
    the previous page.

    If the very first request fails, a mapping with an empty list is
    returned. Raise an Exception carrying the URL and response text if any
    follow-up request fails.
    """
    all_package_names = []

    npm_replicate_all = replicate_url + "_all_docs?" + f"limit={NPM_REPLICATE_BATCH_SIZE}"
    response = requests.get(npm_replicate_all)
    if not response.ok:
        # Same shape as the success path so callers can process it uniformly.
        return {"packages": all_package_names}

    package_data = response.json()
    package_names, last_key = get_package_names_last_key(package_data)
    # Keep a flat list of names, as get_updated_npm_packages() does.
    all_package_names.extend(package_names)

    total_rows = package_data.get("total_rows")
    iterations = int(total_rows / NPM_REPLICATE_BATCH_SIZE) + 1

    for i in range(iterations):
        npm_replicate_from_id = npm_replicate_all + f'&start_key="{last_key}"'
        print(f"Processing iteration: {i}: {npm_replicate_from_id}")

        response = requests.get(npm_replicate_from_id)
        if not response.ok:
            raise Exception(npm_replicate_from_id, response.text)

        package_data = response.json()
        previous_key = last_key
        package_names, last_key = get_package_names_last_key(package_data)

        # The page is requested starting at the last key of the previous page
        # (start_key is inclusive in CouchDB), so when its first row repeats
        # that key, drop the repeat to avoid collecting the same name twice.
        if package_names and package_names[0] == previous_key:
            package_names = package_names[1:]

        all_package_names.extend(package_names)

    return {"packages": all_package_names}
142+
143+
144+
def get_npm_packageurls(name, npm_repo=NPM_REGISTRY_REPO):
    """
    Return a list of PackageURL strings, one for each version of the npm
    package ``name`` listed in the registry at ``npm_repo``.

    ``name`` is the full package name, including the scope for scoped
    packages (such as "@scope/name"). The scope is used as the PackageURL
    namespace.

    Return an empty list if the registry request is not successful or if
    the package data lists no versions.
    """
    packageurls = []

    project_index_api_url = npm_repo + name
    response = requests.get(project_index_api_url)
    if not response.ok:
        return packageurls

    project_data = response.json()
    # Package data may come without a "versions" entry (for example when a
    # package was unpublished): treat this as a package without versions.
    versions = project_data.get("versions") or {}

    # Split a scoped name such as "@scope/name" in a namespace and a name.
    namespace = None
    package_name = name
    if name.startswith("@") and "/" in name:
        namespace, _, package_name = name.partition("/")

    for version in versions:
        purl = PackageURL(
            type=NPM_TYPE,
            namespace=namespace,
            name=package_name,
            version=version,
        )
        packageurls.append(purl.to_string())

    return packageurls
162+
163+
164+
def write_packages_json(packages, name):
    """
    Write the ``packages`` data as JSON to a temporary file obtained for
    ``name`` and return the path of this file.
    """
    packages_path = get_temp_file(name)
    write_data_to_json_file(data=packages, path=packages_path)
    return packages_path
168+
169+
170+
def load_npm_packages(packages_file):
    """
    Return the value of the "packages" key of the JSON mapping stored in the
    file at ``packages_file``, or an empty list if this key is missing.
    """
    # Read as UTF-8 explicitly rather than relying on the platform default
    # encoding, which may not be able to decode non-ASCII JSON content.
    with open(packages_file, encoding="utf-8") as f:
        packages_data = json.load(f)

    return packages_data.get("packages", [])

minecode_pipelines/miners/pypi.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,6 @@
1313

1414
from packageurl import PackageURL
1515

16-
from minecode_pipelines.utils import get_temp_file
17-
from minecode_pipelines.pipes import write_data_to_json_file
18-
1916
"""
2017
Visitors for Pypi and Pypi-like Python package repositories.
2118
@@ -52,11 +49,6 @@ def get_pypi_packages(pypi_repo, logger=None):
5249
return response.json()
5350

5451

55-
def write_packages_json(packages, name):
56-
temp_file = get_temp_file(name)
57-
write_data_to_json_file(path=temp_file, data=packages)
58-
return temp_file
59-
6052

6153
def get_pypi_packageurls(name):
6254
packageurls = []
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"last_serial": 0,
3+
"date": null
4+
}
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
#
3+
# http://nexb.com and https://github.com/aboutcode-org/scancode.io
4+
# The ScanCode.io software is licensed under the Apache License version 2.0.
5+
# Data generated with ScanCode.io is provided as-is without warranties.
6+
# ScanCode is a trademark of nexB Inc.
7+
#
8+
# You may not use this software except in compliance with the License.
9+
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
10+
# Unless required by applicable law or agreed to in writing, software distributed
11+
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12+
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
13+
# specific language governing permissions and limitations under the License.
14+
#
15+
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
16+
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
17+
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
18+
# for any legal advice.
19+
#
20+
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
21+
# Visit https://github.com/aboutcode-org/scancode.io for support and download.
22+
23+
from scanpipe.pipelines import Pipeline
24+
from scanpipe.pipes import federatedcode
25+
26+
from minecode_pipelines.pipes import npm
27+
from minecode_pipelines import pipes
28+
29+
30+
class MineandPublishNPMPURLs(Pipeline):
    """
    Pipeline that collects npm package names, mines the PackageURLs of these
    packages and publishes them to a FederatedCode repo.
    """

    @classmethod
    def steps(cls):
        """Return the ordered tuple of the step methods of this pipeline."""
        return (
            cls.check_federatedcode_eligibility,
            cls.mine_npm_packages,
            cls.mine_and_publish_npm_packageurls,
            cls.delete_cloned_repos,
        )

    def check_federatedcode_eligibility(self):
        """Check that FederatedCode is configured and available."""
        federatedcode.check_federatedcode_configured_and_available(logger=self.log)

    def mine_npm_packages(self):
        """
        Mine npm package names and keep the returned packages file, state
        and last sequence on this pipeline for the next steps.
        """
        mined = npm.mine_npm_packages(logger=self.log)
        self.npm_packages, self.state, self.last_seq = mined

    def mine_and_publish_npm_packageurls(self):
        """
        Mine and publish the npm packageURLs of the package names mined in
        the previous step, and keep the returned repos on this pipeline.
        """
        publish_options = {
            "packages_file": self.npm_packages,
            "state": self.state,
            "last_seq": self.last_seq,
            "logger": self.log,
        }
        self.repos = npm.mine_and_publish_npm_packageurls(**publish_options)

    def delete_cloned_repos(self):
        """Delete the repos kept by the mine and publish step."""
        pipes.delete_cloned_repos(repos=self.repos, logger=self.log)

minecode_pipelines/pipes/__init__.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@
2020
from scanpipe.pipes.federatedcode import delete_local_clone
2121
from scanpipe.pipes.federatedcode import commit_and_push_changes
2222

23+
from minecode_pipelines.utils import get_temp_file
24+
from minecode_pipelines.pipes import write_data_to_json_file
25+
2326
# states:
2427
# note: a state is null when mining starts
2528
INITIAL_SYNC_STATE = "initial-sync"
@@ -29,6 +32,12 @@
2932
MINECODE_PIPELINES_CONFIG_REPO = "https://github.com/aboutcode-data/minecode-pipelines-config/"
3033

3134

35+
def write_packages_json(packages, name):
    """
    Write the ``packages`` data as JSON to a temporary file obtained for
    ``name`` and return the path of this file.
    """
    json_path = get_temp_file(name)
    write_data_to_json_file(data=packages, path=json_path)
    return json_path
39+
40+
3241
def fetch_checkpoint_from_github(config_repo, checkpoint_path):
3342
repo_name = config_repo.split("github.com")[-1]
3443
checkpoints_file = (
@@ -81,6 +90,32 @@ def update_mined_packages_in_checkpoint(packages, config_repo, cloned_repo, chec
8190
)
8291

8392

93+
def update_checkpoint_state(
    cloned_repo,
    state,
    checkpoint_path,
    config_repo=MINECODE_PIPELINES_CONFIG_REPO,
):
    """
    Fetch the checkpoint at ``checkpoint_path`` from the ``config_repo``,
    set its "state" to ``state`` and update this checkpoint at the same
    path using the ``cloned_repo``.
    """
    current_checkpoint = fetch_checkpoint_from_github(
        checkpoint_path=checkpoint_path,
        config_repo=config_repo,
    )

    # Only the state changes, any other checkpoint data is kept as fetched.
    current_checkpoint["state"] = state

    update_checkpoints_in_github(
        path=checkpoint_path,
        cloned_repo=cloned_repo,
        checkpoint=current_checkpoint,
    )
109+
110+
111+
def get_packages_file_from_checkpoint(config_repo, checkpoint_path, name):
    """
    Fetch the checkpoint at ``checkpoint_path`` from the ``config_repo``,
    write its data to a JSON file obtained for ``name`` and return the path
    of this file.
    """
    checkpoint_data = fetch_checkpoint_from_github(
        checkpoint_path=checkpoint_path,
        config_repo=config_repo,
    )
    packages_file = write_packages_json(checkpoint_data, name=name)
    return packages_file
117+
118+
84119
def write_packageurls_to_file(repo, base_dir, packageurls):
85120
purl_file_rel_path = os.path.join(base_dir, PURLS_FILENAME)
86121
purl_file_full_path = Path(repo.working_dir) / purl_file_rel_path

0 commit comments

Comments
 (0)