Skip to content

Commit

Permalink
Handle multiple download URLs per module
Browse files Browse the repository at this point in the history
  • Loading branch information
HebaruSan committed Aug 9, 2023
1 parent 5b7612b commit 323c842
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 86 deletions.
80 changes: 45 additions & 35 deletions netkan/netkan/download_counter.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
if user_repo in self.cache:
count = self.cache[user_repo]
logging.info('Count for %s is %s', ident, count)
counts[ident] = count
if ident in counts:
counts[ident] += count
else:
counts[ident] = count
return counts

def graphql_to_github(self, query: str) -> Dict[str, Any]:
Expand Down Expand Up @@ -168,7 +171,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
count = sd_counts.get(sd_id)
if count:
logging.info('Count for %s is %s', identifier, count)
counts[identifier] = count
if identifier in counts:
counts[identifier] += count
else:
counts[identifier] = count
return counts


Expand Down Expand Up @@ -201,7 +207,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
timeout=60).json()
for ckan_ident, ia_ident in self.ids.items():
counts[ckan_ident] = result[ia_ident]['all_time']
if ckan_ident in counts:
counts[ckan_ident] += result[ia_ident]['all_time']
else:
counts[ckan_ident] = result[ia_ident]['all_time']
return counts


Expand All @@ -223,39 +232,40 @@ def get_counts(self) -> None:
graph_query = GraphQLQuery(self.github_token)
sd_query = SpaceDockBatchedQuery()
ia_query = InternetArchiveBatchedQuery()
for ckan in self.ckm_repo.all_latest_modules():
for ckan in self.ckm_repo.all_latest_modules(): # pylint: disable=too-many-nested-blocks
if ckan.kind == 'dlc':
continue
try:
url_parse = urllib.parse.urlparse(ckan.download)
if url_parse.netloc == 'github.com':
match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
if match:
# Process GitHub modules together in big batches
graph_query.add(ckan.identifier, *match.groups())
if graph_query.full():
# Run the query
graph_query.get_result(self.counts)
# Clear request list
graph_query.clear()
elif url_parse.netloc == 'spacedock.info':
match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
if match:
# Process SpaceDock modules together in one huge batch
sd_query.add(ckan.identifier, int(match.group(1)))
else:
logging.error('Failed to parse SD URL for %s: %s',
ckan.identifier, ckan.download)
elif url_parse.netloc == 'archive.org':
ia_query.add(ckan)
if ia_query.full():
ia_query.get_result(self.counts)
ia_query = InternetArchiveBatchedQuery()
except Exception as exc: # pylint: disable=broad-except
# Don't let one bad apple spoil the bunch
# Print file path because netkan_dl might be None
logging.error('DownloadCounter failed for %s',
ckan.identifier, exc_info=exc)
for download in ckan.downloads:
try:
url_parse = urllib.parse.urlparse(download)
if url_parse.netloc == 'github.com':
match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
if match:
# Process GitHub modules together in big batches
graph_query.add(ckan.identifier, *match.groups())
if graph_query.full():
# Run the query
graph_query.get_result(self.counts)
# Clear request list
graph_query.clear()
elif url_parse.netloc == 'spacedock.info':
match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
if match:
# Process SpaceDock modules together in one huge batch
sd_query.add(ckan.identifier, int(match.group(1)))
else:
logging.error('Failed to parse SD URL for %s: %s',
ckan.identifier, download)
elif url_parse.netloc == 'archive.org':
ia_query.add(ckan)
if ia_query.full():
ia_query.get_result(self.counts)
ia_query = InternetArchiveBatchedQuery()
except Exception as exc: # pylint: disable=broad-except
# Don't let one bad apple spoil the bunch
# Print file path because netkan_dl might be None
logging.error('DownloadCounter failed for %s',
ckan.identifier, exc_info=exc)
if not sd_query.empty():
sd_query.get_result(self.counts)
if not graph_query.empty():
Expand All @@ -275,7 +285,7 @@ def commit_counts(self) -> None:
if self.output_file:
self.ckm_repo.commit(
[self.output_file.as_posix()],
'NetKAN Updating Download Counts'
'NetKAN updating download counts'
)
logging.info('Download counts changed and committed')
self.ckm_repo.push_remote_primary()
Expand Down
7 changes: 4 additions & 3 deletions netkan/netkan/github_pr.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import logging
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Union
from github import Github, GithubException
from github.Repository import Repository

Expand Down Expand Up @@ -52,8 +52,9 @@ def create_pull_request(self, title: str, branch: str, body: str, labels: Option
logging.info('Comment added with id %s', comment.id)

@staticmethod
def get_error_message(exc_data: Dict[str, Any]) -> str:
return ' - '.join([exc_data.get('message',
def get_error_message(exc_data: Union[str, Dict[str, Any]]) -> str:
return exc_data if isinstance(exc_data, str) else ' - '.join([
exc_data.get('message',
'Unknown error'),
*(err['message']
for err in exc_data.get('errors', [])
Expand Down
67 changes: 67 additions & 0 deletions netkan/netkan/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,38 @@ def sqs_message(


class Ckan:
REDISTRIBUTABLE_LICENSES = {
"public-domain",
"Apache", "Apache-1.0", "Apache-2.0",
"Artistic", "Artistic-1.0", "Artistic-2.0",
"BSD-2-clause", "BSD-3-clause", "BSD-4-clause",
"ISC",
"CC-BY", "CC-BY-1.0", "CC-BY-2.0", "CC-BY-2.5", "CC-BY-3.0", "CC-BY-4.0",
"CC-BY-SA", "CC-BY-SA-1.0", "CC-BY-SA-2.0", "CC-BY-SA-2.5", "CC-BY-SA-3.0", "CC-BY-SA-4.0",
"CC-BY-NC", "CC-BY-NC-1.0", "CC-BY-NC-2.0", "CC-BY-NC-2.5", "CC-BY-NC-3.0", "CC-BY-NC-4.0",
"CC-BY-NC-SA", "CC-BY-NC-SA-1.0", "CC-BY-NC-SA-2.0", "CC-BY-NC-SA-2.5", "CC-BY-NC-SA-3.0", "CC-BY-NC-SA-4.0",
"CC-BY-NC-ND", "CC-BY-NC-ND-1.0", "CC-BY-NC-ND-2.0", "CC-BY-NC-ND-2.5", "CC-BY-NC-ND-3.0", "CC-BY-NC-ND-4.0",
"CC-BY-ND", "CC-BY-ND-1.0", "CC-BY-ND-2.0", "CC-BY-ND-2.5", "CC-BY-ND-3.0", "CC-BY-ND-4.0",
"CC0",
"CDDL", "CPL",
"EFL-1.0", "EFL-2.0",
"Expat", "MIT",
"GPL-1.0", "GPL-2.0", "GPL-3.0",
"LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
"GFDL-1.0", "GFDL-1.1", "GFDL-1.2", "GFDL-1.3",
"GFDL-NIV-1.0", "GFDL-NIV-1.1", "GFDL-NIV-1.2", "GFDL-NIV-1.3",
"LPPL-1.0", "LPPL-1.1", "LPPL-1.2", "LPPL-1.3c",
"MPL-1.1", "MPL-2.0",
"Perl",
"Python-2.0",
"QPL-1.0",
"W3C",
"Zlib",
"Zope",
"WTFPL",
"Unlicense",
"open-source", "unrestricted"
}

@total_ordering
class Version:
Expand Down Expand Up @@ -321,6 +353,23 @@ def version(self) -> Version:
raise AttributeError('Required property `version` not found')
return self.Version(raw_ver)

# `download` may hold one URL or a list of URLs; this exposes only the first
@property
def download(self) -> str:
    """Return the primary download URL from the raw metadata.

    A plain (non-list) value is returned unchanged, which includes ``None``
    when the ``download`` key is absent. For a list, the first entry is
    returned, or ``None`` if the list is empty.

    NOTE(review): the annotation says ``str`` but ``None`` is a possible
    result; callers should be prepared for it.
    """
    raw_download = self._raw.get('download')
    if not isinstance(raw_download, list):
        return raw_download
    if len(raw_download) > 0:
        return raw_download[0]
    return None

# Provide all downloads with alternate property in case we need them,
# including implicit archive.org fallback where applicable
@property
def downloads(self) -> List[str]:
    """Return every download URL for this module as a new list.

    The raw ``download`` value may be a single URL or a list of URLs. When
    the module is redistributable and ``mirror_download()`` yields a URL,
    that mirror URL is appended as the last entry.

    Returns:
        A fresh list of URLs (the raw metadata list is never returned
        directly, so callers cannot mutate the metadata by accident).
        Empty when the metadata has no ``download`` entry, so iterating
        callers do not have to guard against ``KeyError``.
    """
    download = self._raw.get('download')
    if download is None:
        # Nothing to download, hence nothing that could have been mirrored
        return []
    downloads = list(download) if isinstance(download, list) else [download]
    # Only ask for the mirror URL when the license allows redistribution
    archive = self.mirror_download() if self.redistributable else None
    if archive:
        downloads.append(archive)
    return downloads

@property
def cache_prefix(self) -> Optional[str]:
if 'download' not in self._raw:
Expand Down Expand Up @@ -372,3 +421,21 @@ def authors(self) -> List[str]:
def licenses(self) -> List[str]:
    """Return ``self.license`` as a list, wrapping a lone value in one."""
    declared = self.license
    if isinstance(declared, list):
        return declared
    return [declared]

@property
def redistributable(self) -> bool:
    """Tell whether any of the module's licenses is in REDISTRIBUTABLE_LICENSES."""
    return any(lic in self.REDISTRIBUTABLE_LICENSES
               for lic in self.licenses())

def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
    """Build the file name used for this module's mirrored download.

    The name is ``<first 8 chars of sha1>-<identifier>-<version>.<extension>``,
    where the extension is looked up in ``Ckan.MIME_TO_EXTENSION`` by the
    download's content type. ``with_epoch`` is forwarded to
    ``_format_version``.

    Returns ``None`` when the raw metadata has no ``download_hash`` entry.
    """
    if 'download_hash' not in self._raw:
        return None
    sha1_prefix = self.download_hash["sha1"][0:8]
    ident = self.identifier
    version = self._format_version(with_epoch)
    extension = Ckan.MIME_TO_EXTENSION[self.download_content_type]
    return f'{sha1_prefix}-{ident}-{version}.{extension}'

def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
    """Return the archive.org download URL for this module's mirrored file.

    The URL has the form
    ``https://archive.org/download/<identifier>-<version>/<mirror filename>``.
    ``with_epoch`` is forwarded to both ``mirror_filename`` and
    ``_format_version``.

    Returns ``None`` when ``mirror_filename`` yields nothing.
    """
    mirrored_name = self.mirror_filename(with_epoch)
    if not mirrored_name:
        return None
    return f'https://archive.org/download/{self.identifier}-{self._format_version(with_epoch)}/{mirrored_name}'
52 changes: 4 additions & 48 deletions netkan/netkan/mirrorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,39 +28,6 @@ class CkanMirror(Ckan):
DESCRIPTION_TEMPLATE = Template(
read_text('netkan', 'mirror_description_template.jinja2'))

REDISTRIBUTABLE_LICENSES = {
"public-domain",
"Apache", "Apache-1.0", "Apache-2.0",
"Artistic", "Artistic-1.0", "Artistic-2.0",
"BSD-2-clause", "BSD-3-clause", "BSD-4-clause",
"ISC",
"CC-BY", "CC-BY-1.0", "CC-BY-2.0", "CC-BY-2.5", "CC-BY-3.0", "CC-BY-4.0",
"CC-BY-SA", "CC-BY-SA-1.0", "CC-BY-SA-2.0", "CC-BY-SA-2.5", "CC-BY-SA-3.0", "CC-BY-SA-4.0",
"CC-BY-NC", "CC-BY-NC-1.0", "CC-BY-NC-2.0", "CC-BY-NC-2.5", "CC-BY-NC-3.0", "CC-BY-NC-4.0",
"CC-BY-NC-SA", "CC-BY-NC-SA-1.0", "CC-BY-NC-SA-2.0", "CC-BY-NC-SA-2.5", "CC-BY-NC-SA-3.0", "CC-BY-NC-SA-4.0",
"CC-BY-NC-ND", "CC-BY-NC-ND-1.0", "CC-BY-NC-ND-2.0", "CC-BY-NC-ND-2.5", "CC-BY-NC-ND-3.0", "CC-BY-NC-ND-4.0",
"CC-BY-ND", "CC-BY-ND-1.0", "CC-BY-ND-2.0", "CC-BY-ND-2.5", "CC-BY-ND-3.0", "CC-BY-ND-4.0",
"CC0",
"CDDL", "CPL",
"EFL-1.0", "EFL-2.0",
"Expat", "MIT",
"GPL-1.0", "GPL-2.0", "GPL-3.0",
"LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
"GFDL-1.0", "GFDL-1.1", "GFDL-1.2", "GFDL-1.3",
"GFDL-NIV-1.0", "GFDL-NIV-1.1", "GFDL-NIV-1.2", "GFDL-NIV-1.3",
"LPPL-1.0", "LPPL-1.1", "LPPL-1.2", "LPPL-1.3c",
"MPL-1.1", "MPL-2.0",
"Perl",
"Python-2.0",
"QPL-1.0",
"W3C",
"Zlib",
"Zope",
"WTFPL",
"Unlicense",
"open-source", "unrestricted"
}

LICENSE_URLS = {
"Apache" : 'http://www.apache.org/licenses/LICENSE-1.0',
"Apache-1.0" : 'http://www.apache.org/licenses/LICENSE-1.0',
Expand Down Expand Up @@ -163,21 +130,9 @@ def license_urls(self) -> List[str]:
return [self.LICENSE_URLS[lic]
for lic in self.licenses() if lic in self.LICENSE_URLS]

@property
def redistributable(self) -> bool:
    """Tell whether any of the module's licenses is in REDISTRIBUTABLE_LICENSES."""
    return any(lic in self.REDISTRIBUTABLE_LICENSES
               for lic in self.licenses())

def mirror_item(self, with_epoch: bool = True) -> str:
    """Return ``<identifier>-<version>``; ``with_epoch`` goes to ``_format_version``."""
    ident = self.identifier
    version = self._format_version(with_epoch)
    return f'{ident}-{version}'

def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
    """Build the file name used for this module's mirrored download.

    The name is ``<first 8 chars of sha1>-<identifier>-<version>.<extension>``,
    where the extension is looked up in ``Ckan.MIME_TO_EXTENSION`` by the
    download's content type. ``with_epoch`` is forwarded to
    ``_format_version``.

    Returns ``None`` when the raw metadata has no ``download_hash`` entry.
    """
    if 'download_hash' not in self._raw:
        return None
    sha1_prefix = self.download_hash["sha1"][0:8]
    ident = self.identifier
    version = self._format_version(with_epoch)
    extension = Ckan.MIME_TO_EXTENSION[self.download_content_type]
    return f'{sha1_prefix}-{ident}-{version}.{extension}'

def mirror_source_filename(self, with_epoch: bool = True) -> str:
    """Return ``<identifier>-<version>.source.zip``; ``with_epoch`` goes to ``_format_version``."""
    ident = self.identifier
    version = self._format_version(with_epoch)
    return f'{ident}-{version}.source.zip'

Expand Down Expand Up @@ -379,8 +334,9 @@ def purge_epochs(self, dry_run: bool) -> None:
if dry_run:
logging.info('Dry run mode enabled, no changes will be made')
for result in self._epoch_search():
ident = result.get('identifier')
if ident:
if 'ident' in result:
# https://internetarchive.readthedocs.io/en/stable/internetarchive.html#internetarchive.Search
ident = result['identifier'] # type: ignore[index]
item = self.ia_session.get_item(ident)
logging.info('Found epoch to purge: %s (%s)', ident, item.metadata.get('title'))
if not dry_run:
Expand All @@ -391,7 +347,7 @@ def purge_epochs(self, dry_run: bool) -> None:

def _epoch_search(self) -> Iterable[internetarchive.Search]:
return filter(
self._result_has_epoch,
self._result_has_epoch, # type: ignore[arg-type]
self.ia_session.search_items(
f'collection:({self.ia_collection})',
fields=['identifier', 'title']
Expand Down

0 comments on commit 323c842

Please sign in to comment.