Handle multiple download URLs per module #306

Merged · 2 commits · Aug 11, 2023
Changes from 1 commit
80 changes: 45 additions & 35 deletions netkan/netkan/download_counter.py
@@ -113,7 +113,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
if user_repo in self.cache:
count = self.cache[user_repo]
logging.info('Count for %s is %s', ident, count)
counts[ident] = count
if ident in counts:
counts[ident] += count
else:
counts[ident] = count
return counts

def graphql_to_github(self, query: str) -> Dict[str, Any]:
@@ -168,7 +171,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
count = sd_counts.get(sd_id)
if count:
logging.info('Count for %s is %s', identifier, count)
counts[identifier] = count
if identifier in counts:
counts[identifier] += count
else:
counts[identifier] = count
return counts


@@ -201,7 +207,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
timeout=60).json()
for ckan_ident, ia_ident in self.ids.items():
counts[ckan_ident] = result[ia_ident]['all_time']
if ckan_ident in counts:
counts[ckan_ident] += result[ia_ident]['all_time']
else:
counts[ckan_ident] = result[ia_ident]['all_time']
return counts
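
The accumulate-or-set pattern repeated in the three get_result methods above exists because one module can now report counts from more than one host (for example GitHub plus its archive.org mirror), so later queries must add to earlier totals rather than overwrite them. A minimal standalone sketch of that merge, assuming counts is a plain Dict[str, int]:

    from typing import Dict

    def add_count(counts: Dict[str, int], ident: str, count: int) -> None:
        # Later hosts add to an identifier's running total instead of overwriting it
        counts[ident] = counts.get(ident, 0) + count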


@@ -223,39 +232,40 @@ def get_counts(self) -> None:
graph_query = GraphQLQuery(self.github_token)
sd_query = SpaceDockBatchedQuery()
ia_query = InternetArchiveBatchedQuery()
for ckan in self.ckm_repo.all_latest_modules():
for ckan in self.ckm_repo.all_latest_modules(): # pylint: disable=too-many-nested-blocks
if ckan.kind == 'dlc':
continue
try:
url_parse = urllib.parse.urlparse(ckan.download)
if url_parse.netloc == 'github.com':
match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
if match:
# Process GitHub modules together in big batches
graph_query.add(ckan.identifier, *match.groups())
if graph_query.full():
# Run the query
graph_query.get_result(self.counts)
# Clear request list
graph_query.clear()
elif url_parse.netloc == 'spacedock.info':
match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
if match:
# Process SpaceDock modules together in one huge batch
sd_query.add(ckan.identifier, int(match.group(1)))
else:
logging.error('Failed to parse SD URL for %s: %s',
ckan.identifier, ckan.download)
elif url_parse.netloc == 'archive.org':
ia_query.add(ckan)
if ia_query.full():
ia_query.get_result(self.counts)
ia_query = InternetArchiveBatchedQuery()
except Exception as exc: # pylint: disable=broad-except
# Don't let one bad apple spoil the bunch
# Print file path because netkan_dl might be None
logging.error('DownloadCounter failed for %s',
ckan.identifier, exc_info=exc)
for download in ckan.downloads:
try:
url_parse = urllib.parse.urlparse(download)
if url_parse.netloc == 'github.com':
match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
if match:
# Process GitHub modules together in big batches
graph_query.add(ckan.identifier, *match.groups())
if graph_query.full():
# Run the query
graph_query.get_result(self.counts)
# Clear request list
graph_query.clear()
elif url_parse.netloc == 'spacedock.info':
match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
if match:
# Process SpaceDock modules together in one huge batch
sd_query.add(ckan.identifier, int(match.group(1)))
else:
logging.error('Failed to parse SD URL for %s: %s',
ckan.identifier, download)
elif url_parse.netloc == 'archive.org':
ia_query.add(ckan)
if ia_query.full():
ia_query.get_result(self.counts)
ia_query = InternetArchiveBatchedQuery()
except Exception as exc: # pylint: disable=broad-except
# Don't let one bad apple spoil the bunch
# Print file path because netkan_dl might be None
logging.error('DownloadCounter failed for %s',
ckan.identifier, exc_info=exc)
if not sd_query.empty():
sd_query.get_result(self.counts)
if not graph_query.empty():
@@ -275,7 +285,7 @@ def commit_counts(self) -> None:
if self.output_file:
self.ckm_repo.commit(
[self.output_file.as_posix()],
'NetKAN Updating Download Counts'
'NetKAN updating download counts'
)
logging.info('Download counts changed and committed')
self.ckm_repo.push_remote_primary()
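The routing in get_counts keys off url_parse.netloc, with url_parse.path then matched against the host-specific pattern; a small illustrative sketch of that split (the URL below is made up):

    import urllib.parse

    # Made-up URL, for illustration only
    url = 'https://github.com/someuser/SomeMod/releases/download/v1.0/SomeMod.zip'
    parts = urllib.parse.urlparse(url)
    print(parts.netloc)  # 'github.com' -> routed to the batched GraphQL query
    print(parts.path)    # '/someuser/SomeMod/...' -> matched against GITHUB_PATH_PATTERN
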
7 changes: 4 additions & 3 deletions netkan/netkan/github_pr.py
@@ -1,5 +1,5 @@
import logging
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Union
from github import Github, GithubException
from github.Repository import Repository

@@ -52,8 +52,9 @@ def create_pull_request(self, title: str, branch: str, body: str, labels: Option
logging.info('Comment added with id %s', comment.id)

@staticmethod
def get_error_message(exc_data: Dict[str, Any]) -> str:
return ' - '.join([exc_data.get('message',
def get_error_message(exc_data: Union[str, Dict[str, Any]]) -> str:
return exc_data if isinstance(exc_data, str) else ' - '.join([
exc_data.get('message',
'Unknown error'),
*(err['message']
for err in exc_data.get('errors', [])
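The widened signature above lets get_error_message cope with GitHub error payloads that arrive either as a bare string or as a dict with message and errors keys. A simplified standalone restatement of that branch, with illustrative payloads (not taken from a real API response):

    from typing import Any, Dict, Union

    def error_message(exc_data: Union[str, Dict[str, Any]]) -> str:
        # Bare strings pass through; dicts flatten to 'message - err1 - err2 - ...'
        if isinstance(exc_data, str):
            return exc_data
        return ' - '.join([exc_data.get('message', 'Unknown error'),
                           *(err['message'] for err in exc_data.get('errors', []))])

    print(error_message('Resource not accessible'))  # illustrative string payload
    print(error_message({'message': 'Validation Failed',
                         'errors': [{'message': 'A pull request already exists'}]}))
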
67 changes: 67 additions & 0 deletions netkan/netkan/metadata.py
@@ -116,6 +116,38 @@ def sqs_message(


class Ckan:
REDISTRIBUTABLE_LICENSES = {
"public-domain",
"Apache", "Apache-1.0", "Apache-2.0",
"Artistic", "Artistic-1.0", "Artistic-2.0",
"BSD-2-clause", "BSD-3-clause", "BSD-4-clause",
"ISC",
"CC-BY", "CC-BY-1.0", "CC-BY-2.0", "CC-BY-2.5", "CC-BY-3.0", "CC-BY-4.0",
"CC-BY-SA", "CC-BY-SA-1.0", "CC-BY-SA-2.0", "CC-BY-SA-2.5", "CC-BY-SA-3.0", "CC-BY-SA-4.0",
"CC-BY-NC", "CC-BY-NC-1.0", "CC-BY-NC-2.0", "CC-BY-NC-2.5", "CC-BY-NC-3.0", "CC-BY-NC-4.0",
"CC-BY-NC-SA", "CC-BY-NC-SA-1.0", "CC-BY-NC-SA-2.0", "CC-BY-NC-SA-2.5", "CC-BY-NC-SA-3.0", "CC-BY-NC-SA-4.0",
"CC-BY-NC-ND", "CC-BY-NC-ND-1.0", "CC-BY-NC-ND-2.0", "CC-BY-NC-ND-2.5", "CC-BY-NC-ND-3.0", "CC-BY-NC-ND-4.0",
"CC-BY-ND", "CC-BY-ND-1.0", "CC-BY-ND-2.0", "CC-BY-ND-2.5", "CC-BY-ND-3.0", "CC-BY-ND-4.0",
"CC0",
"CDDL", "CPL",
"EFL-1.0", "EFL-2.0",
"Expat", "MIT",
"GPL-1.0", "GPL-2.0", "GPL-3.0",
"LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
"GFDL-1.0", "GFDL-1.1", "GFDL-1.2", "GFDL-1.3",
"GFDL-NIV-1.0", "GFDL-NIV-1.1", "GFDL-NIV-1.2", "GFDL-NIV-1.3",
"LPPL-1.0", "LPPL-1.1", "LPPL-1.2", "LPPL-1.3c",
"MPL-1.1", "MPL-2.0",
"Perl",
"Python-2.0",
"QPL-1.0",
"W3C",
"Zlib",
"Zope",
"WTFPL",
"Unlicense",
"open-source", "unrestricted"
}

@total_ordering
class Version:
@@ -321,6 +353,23 @@ def version(self) -> Version:
raise AttributeError('Required property `version` not found')
return self.Version(raw_ver)

# download can be a list now, default to the first one
@property
def download(self) -> str:
download = self._raw.get('download')
if isinstance(download, list):
return download[0] if len(download) > 0 else None
return download

# Provide all downloads with alternate property in case we need them,
# including implicit archive.org fallback where applicable
@property
def downloads(self) -> List[str]:
download = self._raw['download']
downloads = download if isinstance(download, list) else [download]
archive = self.mirror_download() if self.redistributable else None
return [*downloads, archive] if archive else downloads

@property
def cache_prefix(self) -> Optional[str]:
if 'download' not in self._raw:
@@ -372,3 +421,21 @@ def authors(self) -> List[str]:
def licenses(self) -> List[str]:
lic = self.license
return lic if isinstance(lic, list) else [lic]

@property
def redistributable(self) -> bool:
for lic in self.licenses():
if lic in self.REDISTRIBUTABLE_LICENSES:
return True
return False

def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
if 'download_hash' not in self._raw:
return None
return f'{self.download_hash["sha1"][0:8]}-{self.identifier}-{self._format_version(with_epoch)}.{Ckan.MIME_TO_EXTENSION[self.download_content_type]}'

def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
filename = self.mirror_filename(with_epoch)
if filename:
return f'https://archive.org/download/{self.identifier}-{self._format_version(with_epoch)}/{filename}'
return None
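
Taken together, the new download, downloads, redistributable, and mirror_download members let callers enumerate every URL a module can be fetched from, including the archive.org mirror when the license permits redistribution. A minimal standalone sketch of the downloads fallback logic, assuming download is either a single URL or a list and mirror_url stands in for mirror_download():

    from typing import List, Optional, Union

    def all_downloads(download: Union[str, List[str]],
                      mirror_url: Optional[str]) -> List[str]:
        # Normalize a lone URL to a one-element list, then append the mirror if present
        urls = download if isinstance(download, list) else [download]
        return [*urls, mirror_url] if mirror_url else urls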
52 changes: 4 additions & 48 deletions netkan/netkan/mirrorer.py
@@ -28,39 +28,6 @@ class CkanMirror(Ckan):
DESCRIPTION_TEMPLATE = Template(
read_text('netkan', 'mirror_description_template.jinja2'))

REDISTRIBUTABLE_LICENSES = {
"public-domain",
"Apache", "Apache-1.0", "Apache-2.0",
"Artistic", "Artistic-1.0", "Artistic-2.0",
"BSD-2-clause", "BSD-3-clause", "BSD-4-clause",
"ISC",
"CC-BY", "CC-BY-1.0", "CC-BY-2.0", "CC-BY-2.5", "CC-BY-3.0", "CC-BY-4.0",
"CC-BY-SA", "CC-BY-SA-1.0", "CC-BY-SA-2.0", "CC-BY-SA-2.5", "CC-BY-SA-3.0", "CC-BY-SA-4.0",
"CC-BY-NC", "CC-BY-NC-1.0", "CC-BY-NC-2.0", "CC-BY-NC-2.5", "CC-BY-NC-3.0", "CC-BY-NC-4.0",
"CC-BY-NC-SA", "CC-BY-NC-SA-1.0", "CC-BY-NC-SA-2.0", "CC-BY-NC-SA-2.5", "CC-BY-NC-SA-3.0", "CC-BY-NC-SA-4.0",
"CC-BY-NC-ND", "CC-BY-NC-ND-1.0", "CC-BY-NC-ND-2.0", "CC-BY-NC-ND-2.5", "CC-BY-NC-ND-3.0", "CC-BY-NC-ND-4.0",
"CC-BY-ND", "CC-BY-ND-1.0", "CC-BY-ND-2.0", "CC-BY-ND-2.5", "CC-BY-ND-3.0", "CC-BY-ND-4.0",
"CC0",
"CDDL", "CPL",
"EFL-1.0", "EFL-2.0",
"Expat", "MIT",
"GPL-1.0", "GPL-2.0", "GPL-3.0",
"LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
"GFDL-1.0", "GFDL-1.1", "GFDL-1.2", "GFDL-1.3",
"GFDL-NIV-1.0", "GFDL-NIV-1.1", "GFDL-NIV-1.2", "GFDL-NIV-1.3",
"LPPL-1.0", "LPPL-1.1", "LPPL-1.2", "LPPL-1.3c",
"MPL-1.1", "MPL-2.0",
"Perl",
"Python-2.0",
"QPL-1.0",
"W3C",
"Zlib",
"Zope",
"WTFPL",
"Unlicense",
"open-source", "unrestricted"
}

LICENSE_URLS = {
"Apache" : 'http://www.apache.org/licenses/LICENSE-1.0',
"Apache-1.0" : 'http://www.apache.org/licenses/LICENSE-1.0',
@@ -163,21 +130,9 @@ def license_urls(self) -> List[str]:
return [self.LICENSE_URLS[lic]
for lic in self.licenses() if lic in self.LICENSE_URLS]

@property
def redistributable(self) -> bool:
for lic in self.licenses():
if lic in self.REDISTRIBUTABLE_LICENSES:
return True
return False

def mirror_item(self, with_epoch: bool = True) -> str:
return f'{self.identifier}-{self._format_version(with_epoch)}'

def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
if 'download_hash' not in self._raw:
return None
return f'{self.download_hash["sha1"][0:8]}-{self.identifier}-{self._format_version(with_epoch)}.{Ckan.MIME_TO_EXTENSION[self.download_content_type]}'

def mirror_source_filename(self, with_epoch: bool = True) -> str:
return f'{self.identifier}-{self._format_version(with_epoch)}.source.zip'

@@ -379,8 +334,9 @@ def purge_epochs(self, dry_run: bool) -> None:
if dry_run:
logging.info('Dry run mode enabled, no changes will be made')
for result in self._epoch_search():
ident = result.get('identifier')
if ident:
if 'identifier' in result:
# https://internetarchive.readthedocs.io/en/stable/internetarchive.html#internetarchive.Search
ident = result['identifier'] # type: ignore[index]
item = self.ia_session.get_item(ident)
logging.info('Found epoch to purge: %s (%s)', ident, item.metadata.get('title'))
if not dry_run:
@@ -391,7 +347,7 @@

def _epoch_search(self) -> Iterable[internetarchive.Search]:
return filter(
self._result_has_epoch,
self._result_has_epoch, # type: ignore[arg-type]
self.ia_session.search_items(
f'collection:({self.ia_collection})',
fields=['identifier', 'title']
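The type: ignore annotations above reflect internetarchive's loose typing: search results iterate as plain dicts carrying only the requested fields. A hedged sketch of the consuming side, with a made-up collection name:

    import internetarchive

    session = internetarchive.get_session()
    # Collection name below is illustrative, not the real mirror collection
    for result in session.search_items('collection:(example-collection)',
                                       fields=['identifier', 'title']):
        # Each result exposes only the fields requested above
        print(result['identifier'], result.get('title'))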