Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make download_hash.sha1 optional #344

Merged
merged 1 commit into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion netkan/netkan/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from hashlib import sha1
import uuid
import urllib.parse
from string import Template
from typing import Optional, List, Tuple, Union, Any, Dict, TYPE_CHECKING
from ruamel.yaml import YAML
import dateutil.parser
Expand Down Expand Up @@ -320,6 +321,7 @@ def __str__(self) -> str:
ISODATETIME_PROPERTIES = [
'release_date'
]
MIRROR_FILENAME_TEMPLATE = Template('$prefix-$identifier-$version.$extension')

def __init__(self, filename: Optional[Union[str, Path]] = None, contents: Optional[str] = None) -> None:
if filename:
Expand Down Expand Up @@ -438,7 +440,11 @@ def redistributable(self) -> bool:
def mirror_filename(self, with_epoch: bool = True) -> Optional[str]:
if 'download_hash' not in self._raw:
return None
return f'{self.download_hash["sha1"][0:8]}-{self.identifier}-{self._format_version(with_epoch)}.{Ckan.MIME_TO_EXTENSION[self.download_content_type]}'
return self.MIRROR_FILENAME_TEMPLATE.safe_substitute(
prefix=self._mirror_prefix(),
identifier=self.identifier,
version=self._format_version(with_epoch),
extension=Ckan.MIME_TO_EXTENSION[self.download_content_type])

def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
filename = self.mirror_filename(with_epoch)
Expand All @@ -450,6 +456,12 @@ def mirror_item(self, with_epoch: bool = True) -> str:
return self._ia_bucket_sanitize(
f'{self.identifier}-{self._format_version(with_epoch)}')

def _mirror_prefix(self) -> str:
return (self.download_hash['sha1']
if 'sha1' in self.download_hash
else self.download_hash['sha256']
)[0:8]

# InternetArchive says:
# Bucket names should be valid archive identifiers;
# try someting matching this regular expression:
Expand Down
25 changes: 23 additions & 2 deletions netkan/netkan/mirrorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,22 @@ def mirrored(self, iarchive: internetarchive.session.ArchiveSession) -> bool:
return False
if not item.exists:
return False
sha1 = self.download_hash['sha1'].lower()
return any(file['sha1'].lower() == sha1 for file in item.files if 'sha1' in file)
sha1 = self._sha1()
if sha1 is None:
return False
return any(file['sha1'].lower() == sha1
for file in item.files
if 'sha1' in file)

def _sha1(self) -> Optional[str]:
if 'sha1' in self.download_hash:
# Use hash from metadata if set
return self.download_hash['sha1'].lower()
dl_io = self.open_download()
if dl_io is not None:
# Calculate hash from file if found
return self.large_file_sha1(dl_io)
return None

def license_urls(self) -> List[str]:
return [self.LICENSE_URLS[lic]
Expand Down Expand Up @@ -158,6 +172,13 @@ def large_file_sha256(file: BinaryIO, block_size: int = 8192) -> str:
sha.update(block)
return sha.hexdigest().upper()

@staticmethod
def large_file_sha1(file: BinaryIO, block_size: int = 8192) -> str:
    """Return the upper-case SHA-1 hex digest of a binary stream.

    The stream is read in chunks of ``block_size`` bytes until a read
    returns ``b''``, so the content is hashed incrementally.
    """
    digest = hashlib.sha1()
    while True:
        chunk = file.read(block_size)
        if chunk == b'':
            break
        digest.update(chunk)
    return digest.hexdigest().upper()

def open_if_hash_match(self, path: Path) -> Optional[BinaryIO]:
"""Check whether the file located at the given path matches our sha256.

Expand Down