Handle multiple download URLs per module

KSP-CKAN · Aug 9, 2023
commit 323c842 · 1 parent 5b7612b

Showing 4 changed files with 120 additions and 86 deletions.
diff --git a/netkan/netkan/download_counter.py b/netkan/netkan/download_counter.py
@@ -113,7 +113,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
             if user_repo in self.cache:
                 count = self.cache[user_repo]
                 logging.info('Count for %s is %s', ident, count)
-                counts[ident] = count
+                if ident in counts:
+                    counts[ident] += count
+                else:
+                    counts[ident] = count
         return counts
 
     def graphql_to_github(self, query: str) -> Dict[str, Any]:
@@ -168,7 +171,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
             count = sd_counts.get(sd_id)
             if count:
                 logging.info('Count for %s is %s', identifier, count)
-                counts[identifier] = count
+                if identifier in counts:
+                    counts[identifier] += count
+                else:
+                    counts[identifier] = count
         return counts
 
 
@@ -201,7 +207,10 @@ def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
         result = requests.get(self.IARCHIVE_API + ','.join(self.ids.values()),
                               timeout=60).json()
         for ckan_ident, ia_ident in self.ids.items():
-            counts[ckan_ident] = result[ia_ident]['all_time']
+            if ckan_ident in counts:
+                counts[ckan_ident] += result[ia_ident]['all_time']
+            else:
+                counts[ckan_ident] = result[ia_ident]['all_time']
         return counts
 
 
@@ -223,39 +232,40 @@ def get_counts(self) -> None:
         graph_query = GraphQLQuery(self.github_token)
         sd_query = SpaceDockBatchedQuery()
         ia_query = InternetArchiveBatchedQuery()
-        for ckan in self.ckm_repo.all_latest_modules():
+        for ckan in self.ckm_repo.all_latest_modules():  # pylint: disable=too-many-nested-blocks
             if ckan.kind == 'dlc':
                 continue
-            try:
-                url_parse = urllib.parse.urlparse(ckan.download)
-                if url_parse.netloc == 'github.com':
-                    match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
-                    if match:
-                        # Process GitHub modules together in big batches
-                        graph_query.add(ckan.identifier, *match.groups())
-                        if graph_query.full():
-                            # Run the query
-                            graph_query.get_result(self.counts)
-                            # Clear request list
-                            graph_query.clear()
-                elif url_parse.netloc == 'spacedock.info':
-                    match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
-                    if match:
-                        # Process SpaceDock modules together in one huge batch
-                        sd_query.add(ckan.identifier, int(match.group(1)))
-                    else:
-                        logging.error('Failed to parse SD URL for %s: %s',
-                                      ckan.identifier, ckan.download)
-                elif url_parse.netloc == 'archive.org':
-                    ia_query.add(ckan)
-                    if ia_query.full():
-                        ia_query.get_result(self.counts)
-                        ia_query = InternetArchiveBatchedQuery()
-            except Exception as exc:  # pylint: disable=broad-except
-                # Don't let one bad apple spoil the bunch
-                # Print file path because netkan_dl might be None
-                logging.error('DownloadCounter failed for %s',
-                              ckan.identifier, exc_info=exc)
+            for download in ckan.downloads:
+                try:
+                    url_parse = urllib.parse.urlparse(download)
+                    if url_parse.netloc == 'github.com':
+                        match = self.GITHUB_PATH_PATTERN.match(url_parse.path)
+                        if match:
+                            # Process GitHub modules together in big batches
+                            graph_query.add(ckan.identifier, *match.groups())
+                            if graph_query.full():
+                                # Run the query
+                                graph_query.get_result(self.counts)
+                                # Clear request list
+                                graph_query.clear()
+                    elif url_parse.netloc == 'spacedock.info':
+                        match = self.SPACEDOCK_PATH_PATTERN.match(url_parse.path)
+                        if match:
+                            # Process SpaceDock modules together in one huge batch
+                            sd_query.add(ckan.identifier, int(match.group(1)))
+                        else:
+                            logging.error('Failed to parse SD URL for %s: %s',
+                                          ckan.identifier, download)
+                    elif url_parse.netloc == 'archive.org':
+                        ia_query.add(ckan)
+                        if ia_query.full():
+                            ia_query.get_result(self.counts)
+                            ia_query = InternetArchiveBatchedQuery()
+                except Exception as exc:  # pylint: disable=broad-except
+                    # Don't let one bad apple spoil the bunch
+                    # Print file path because netkan_dl might be None
+                    logging.error('DownloadCounter failed for %s',
+                                  ckan.identifier, exc_info=exc)
         if not sd_query.empty():
             sd_query.get_result(self.counts)
         if not graph_query.empty():
@@ -275,7 +285,7 @@ def commit_counts(self) -> None:
         if self.output_file:
             self.ckm_repo.commit(
                 [self.output_file.as_posix()],
-                'NetKAN Updating Download Counts'
+                'NetKAN updating download counts'
             )
             logging.info('Download counts changed and committed')
             self.ckm_repo.push_remote_primary()

diff --git a/netkan/netkan/github_pr.py b/netkan/netkan/github_pr.py
@@ -1,5 +1,5 @@
 import logging
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Union
 from github import Github, GithubException
 from github.Repository import Repository
 
@@ -52,8 +52,9 @@ def create_pull_request(self, title: str, branch: str, body: str, labels: Option
                 logging.info('Comment added with id %s', comment.id)
 
     @staticmethod
-    def get_error_message(exc_data: Dict[str, Any]) -> str:
-        return ' - '.join([exc_data.get('message',
+    def get_error_message(exc_data: Union[str, Dict[str, Any]]) -> str:
+        return exc_data if isinstance(exc_data, str) else ' - '.join([
+                           exc_data.get('message',
                                         'Unknown error'),
                            *(err['message']
                              for err in exc_data.get('errors', [])

diff --git a/netkan/netkan/metadata.py b/netkan/netkan/metadata.py
@@ -116,6 +116,38 @@ def sqs_message(
 
 
 class Ckan:
+    REDISTRIBUTABLE_LICENSES = {
+        "public-domain",
+        "Apache", "Apache-1.0", "Apache-2.0",
+        "Artistic", "Artistic-1.0", "Artistic-2.0",
+        "BSD-2-clause", "BSD-3-clause", "BSD-4-clause",
+        "ISC",
+        "CC-BY", "CC-BY-1.0", "CC-BY-2.0", "CC-BY-2.5", "CC-BY-3.0", "CC-BY-4.0",
+        "CC-BY-SA", "CC-BY-SA-1.0", "CC-BY-SA-2.0", "CC-BY-SA-2.5", "CC-BY-SA-3.0", "CC-BY-SA-4.0",
+        "CC-BY-NC", "CC-BY-NC-1.0", "CC-BY-NC-2.0", "CC-BY-NC-2.5", "CC-BY-NC-3.0", "CC-BY-NC-4.0",
+        "CC-BY-NC-SA", "CC-BY-NC-SA-1.0", "CC-BY-NC-SA-2.0", "CC-BY-NC-SA-2.5", "CC-BY-NC-SA-3.0", "CC-BY-NC-SA-4.0",
+        "CC-BY-NC-ND", "CC-BY-NC-ND-1.0", "CC-BY-NC-ND-2.0", "CC-BY-NC-ND-2.5", "CC-BY-NC-ND-3.0", "CC-BY-NC-ND-4.0",
+        "CC-BY-ND", "CC-BY-ND-1.0", "CC-BY-ND-2.0", "CC-BY-ND-2.5", "CC-BY-ND-3.0", "CC-BY-ND-4.0",
+        "CC0",
+        "CDDL", "CPL",
+        "EFL-1.0", "EFL-2.0",
+        "Expat", "MIT",
+        "GPL-1.0", "GPL-2.0", "GPL-3.0",
+        "LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
+        "GFDL-1.0", "GFDL-1.1", "GFDL-1.2", "GFDL-1.3",
+        "GFDL-NIV-1.0", "GFDL-NIV-1.1", "GFDL-NIV-1.2", "GFDL-NIV-1.3",
+        "LPPL-1.0", "LPPL-1.1", "LPPL-1.2", "LPPL-1.3c",
+        "MPL-1.1", "MPL-2.0",
+        "Perl",
+        "Python-2.0",
+        "QPL-1.0",
+        "W3C",
+        "Zlib",
+        "Zope",
+        "WTFPL",
+        "Unlicense",
+        "open-source", "unrestricted"
+    }
 
     @total_ordering
     class Version:
@@ -321,6 +353,23 @@ def version(self) -> Version:
             raise AttributeError('Required property `version` not found')
         return self.Version(raw_ver)
 
+    # download can be a list now, default to the first one
+    @property
+    def download(self) -> str:
+        download = self._raw.get('download')
+        if isinstance(download, list):
+            return download[0] if len(download) > 0 else None
+        return download
+
+    # Provide all downloads with alternate property in case we need them,
+    # including implicit archive.org fallback where applicable
+    @property
+    def downloads(self) -> List[str]:
+        download = self._raw['download']
+        downloads = download if isinstance(download, list) else [download]
+        archive = self.mirror_download() if self.redistributable else None
+        return [*downloads, archive] if archive else downloads
+
     @property
     def cache_prefix(self) -> Optional[str]:
         if 'download' not in self._raw:
@@ -372,3 +421,21 @@ def authors(self) -> List[str]:
     def licenses(self) -> List[str]:
         lic = self.license
         return lic if isinstance(lic, list) else [lic]
+
+    @property
+    def redistributable(self) -> bool:
+        for lic in self.licenses():
+            if lic in self.REDISTRIBUTABLE_LICENSES:
+                return True
+        return False
+
+    def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
+        if 'download_hash' not in self._raw:
+            return None
+        return f'{self.download_hash["sha1"][0:8]}-{self.identifier}-{self._format_version(with_epoch)}.{Ckan.MIME_TO_EXTENSION[self.download_content_type]}'
+
+    def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
+        filename = self.mirror_filename(with_epoch)
+        if filename:
+            return f'https://archive.org/download/{self.identifier}-{self._format_version(with_epoch)}/{filename}'
+        return None
diff --git a/netkan/netkan/mirrorer.py b/netkan/netkan/mirrorer.py
@@ -28,39 +28,6 @@ class CkanMirror(Ckan):
     DESCRIPTION_TEMPLATE = Template(
         read_text('netkan', 'mirror_description_template.jinja2'))
 
-    REDISTRIBUTABLE_LICENSES = {
-        "public-domain",
-        "Apache", "Apache-1.0", "Apache-2.0",
-        "Artistic", "Artistic-1.0", "Artistic-2.0",
-        "BSD-2-clause", "BSD-3-clause", "BSD-4-clause",
-        "ISC",
-        "CC-BY", "CC-BY-1.0", "CC-BY-2.0", "CC-BY-2.5", "CC-BY-3.0", "CC-BY-4.0",
-        "CC-BY-SA", "CC-BY-SA-1.0", "CC-BY-SA-2.0", "CC-BY-SA-2.5", "CC-BY-SA-3.0", "CC-BY-SA-4.0",
-        "CC-BY-NC", "CC-BY-NC-1.0", "CC-BY-NC-2.0", "CC-BY-NC-2.5", "CC-BY-NC-3.0", "CC-BY-NC-4.0",
-        "CC-BY-NC-SA", "CC-BY-NC-SA-1.0", "CC-BY-NC-SA-2.0", "CC-BY-NC-SA-2.5", "CC-BY-NC-SA-3.0", "CC-BY-NC-SA-4.0",
-        "CC-BY-NC-ND", "CC-BY-NC-ND-1.0", "CC-BY-NC-ND-2.0", "CC-BY-NC-ND-2.5", "CC-BY-NC-ND-3.0", "CC-BY-NC-ND-4.0",
-        "CC-BY-ND", "CC-BY-ND-1.0", "CC-BY-ND-2.0", "CC-BY-ND-2.5", "CC-BY-ND-3.0", "CC-BY-ND-4.0",
-        "CC0",
-        "CDDL", "CPL",
-        "EFL-1.0", "EFL-2.0",
-        "Expat", "MIT",
-        "GPL-1.0", "GPL-2.0", "GPL-3.0",
-        "LGPL-2.0", "LGPL-2.1", "LGPL-3.0",
-        "GFDL-1.0", "GFDL-1.1", "GFDL-1.2", "GFDL-1.3",
-        "GFDL-NIV-1.0", "GFDL-NIV-1.1", "GFDL-NIV-1.2", "GFDL-NIV-1.3",
-        "LPPL-1.0", "LPPL-1.1", "LPPL-1.2", "LPPL-1.3c",
-        "MPL-1.1", "MPL-2.0",
-        "Perl",
-        "Python-2.0",
-        "QPL-1.0",
-        "W3C",
-        "Zlib",
-        "Zope",
-        "WTFPL",
-        "Unlicense",
-        "open-source", "unrestricted"
-    }
-
     LICENSE_URLS = {
         "Apache"            : 'http://www.apache.org/licenses/LICENSE-1.0',
         "Apache-1.0"        : 'http://www.apache.org/licenses/LICENSE-1.0',
@@ -163,21 +130,9 @@ def license_urls(self) -> List[str]:
         return [self.LICENSE_URLS[lic]
                 for lic in self.licenses() if lic in self.LICENSE_URLS]
 
-    @property
-    def redistributable(self) -> bool:
-        for lic in self.licenses():
-            if lic in self.REDISTRIBUTABLE_LICENSES:
-                return True
-        return False
-
     def mirror_item(self, with_epoch: bool = True) -> str:
         return f'{self.identifier}-{self._format_version(with_epoch)}'
 
-    def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
-        if 'download_hash' not in self._raw:
-            return None
-        return f'{self.download_hash["sha1"][0:8]}-{self.identifier}-{self._format_version(with_epoch)}.{Ckan.MIME_TO_EXTENSION[self.download_content_type]}'
-
     def mirror_source_filename(self, with_epoch: bool = True) -> str:
         return f'{self.identifier}-{self._format_version(with_epoch)}.source.zip'
 
@@ -379,8 +334,9 @@ def purge_epochs(self, dry_run: bool) -> None:
         if dry_run:
             logging.info('Dry run mode enabled, no changes will be made')
         for result in self._epoch_search():
-            ident = result.get('identifier')
-            if ident:
+            if 'ident' in result:
+                # https://internetarchive.readthedocs.io/en/stable/internetarchive.html#internetarchive.Search
+                ident = result['identifier']  # type: ignore[index]
                 item = self.ia_session.get_item(ident)
                 logging.info('Found epoch to purge: %s (%s)', ident, item.metadata.get('title'))
                 if not dry_run:
@@ -391,7 +347,7 @@ def purge_epochs(self, dry_run: bool) -> None:
 
     def _epoch_search(self) -> Iterable[internetarchive.Search]:
         return filter(
-            self._result_has_epoch,
+            self._result_has_epoch,  # type: ignore[arg-type]
             self.ia_session.search_items(
                 f'collection:({self.ia_collection})',
                 fields=['identifier', 'title']