More robust download (#721)
* Support Python 3.13

* Remove scikit-misc

* Add  scikit-misc

* docs

* More robust download

* Further improvements

* Improved download

* Remove comments
Zethson authored Mar 4, 2025
1 parent 058f2dc commit c6baf5f
Showing 5 changed files with 76 additions and 32 deletions.
4 changes: 2 additions & 2 deletions .github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.10.0 🌈"
-tag-template: 0.10.0
+name-template: "0.10.1 🌈"
+tag-template: 0.10.1
 exclude-labels:
   - "skip-changelog"

8 changes: 4 additions & 4 deletions .github/workflows/test.yml
@@ -27,10 +27,10 @@ jobs:
           - os: ubuntu-22.04
             python: "3.13"
             run_mode: "fast"
-          # - os: ubuntu-latest
-          #   python: "3.13"
-          #   run_mode: slow
-          #   pip-flags: "--pre"
+          - os: ubuntu-latest
+            python: "3.13"
+            run_mode: slow
+            pip-flags: "--pre"

     env:
       OS: ${{ matrix.os }}
2 changes: 1 addition & 1 deletion pertpy/__init__.py
@@ -2,7 +2,7 @@

 __author__ = "Lukas Heumos"
 __email__ = "[email protected]"
-__version__ = "0.10.0"
+__version__ = "0.10.1"

 import warnings

92 changes: 68 additions & 24 deletions pertpy/data/_dataloader.py
@@ -1,4 +1,6 @@
+import shutil
 import tempfile
+import time
 from pathlib import Path
 from random import choice
 from string import ascii_lowercase
@@ -7,6 +9,7 @@
 import requests
 from filelock import FileLock
 from lamin_utils import logger
+from requests.exceptions import RequestException
 from rich.progress import Progress


@@ -17,7 +20,10 @@ def _download(  # pragma: no cover
     block_size: int = 1024,
     overwrite: bool = False,
     is_zip: bool = False,
-) -> None:
+    timeout: int = 30,
+    max_retries: int = 3,
+    retry_delay: int = 5,
+) -> Path:
     """Downloads a dataset irrespective of the format.

     Args:
@@ -27,6 +33,9 @@
         block_size: Block size for downloads in bytes.
         overwrite: Whether to overwrite existing files.
         is_zip: Whether the downloaded file needs to be unzipped.
+        timeout: Request timeout in seconds.
+        max_retries: Maximum number of retry attempts.
+        retry_delay: Delay between retries in seconds.
     """
     if output_file_name is None:
         letters = ascii_lowercase
@@ -35,36 +44,71 @@
     if output_path is None:
         output_path = tempfile.gettempdir()

-    download_to_path = (
-        f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
-    )
+    download_to_path = Path(output_path) / output_file_name

     Path(output_path).mkdir(parents=True, exist_ok=True)
-    lock_path = f"{output_path}/{output_file_name}.lock"
-    with FileLock(lock_path):
+    lock_path = Path(output_path) / f"{output_file_name}.lock"
+
+    with FileLock(lock_path, timeout=300):
         if Path(download_to_path).exists() and not overwrite:
             logger.warning(f"File {download_to_path} already exists!")
-            return
+            return download_to_path

+        temp_file_name = Path(f"{download_to_path}.part")
+
+        retry_count = 0
+        while retry_count <= max_retries:
+            try:
+                head_response = requests.head(url, timeout=timeout)
+                head_response.raise_for_status()
+                content_length = int(head_response.headers.get("content-length", 0))
+
+                free_space = shutil.disk_usage(output_path).free
+                if content_length > free_space:
+                    raise OSError(
+                        f"Insufficient disk space. Need {content_length} bytes, but only {free_space} available."
+                    )
+
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+                total = int(response.headers.get("content-length", 0))

-        temp_file_name = f"{download_to_path}.part"
+                with Progress(refresh_per_second=5) as progress:
+                    task = progress.add_task("[red]Downloading...", total=total)
+                    with Path(temp_file_name).open("wb") as file:
+                        for data in response.iter_content(block_size):
+                            file.write(data)
+                            progress.update(task, advance=len(data))
+                    progress.update(task, completed=total, refresh=True)

-        response = requests.get(url, stream=True)
-        total = int(response.headers.get("content-length", 0))
+                Path(temp_file_name).replace(download_to_path)

-        with Progress(refresh_per_second=5) as progress:
-            task = progress.add_task("[red]Downloading...", total=total)
-            with Path(temp_file_name).open("wb") as file:
-                for data in response.iter_content(block_size):
-                    file.write(data)
-                    progress.update(task, advance=block_size)
-            progress.update(task, completed=total, refresh=True)
+                if is_zip:
+                    with ZipFile(download_to_path, "r") as zip_obj:
+                        zip_obj.extractall(path=output_path)
+                    return Path(output_path)

-        Path(temp_file_name).replace(download_to_path)
+                return download_to_path
+            except (OSError, RequestException) as e:
+                retry_count += 1
+                if retry_count <= max_retries:
+                    logger.warning(
+                        f"Download attempt {retry_count}/{max_retries} failed: {str(e)}. Retrying in {retry_delay} seconds..."
+                    )
+                    time.sleep(retry_delay)
+                else:
+                    logger.error(f"Download failed after {max_retries} attempts: {str(e)}")
+                    if Path(temp_file_name).exists():
+                        Path(temp_file_name).unlink(missing_ok=True)
+                    raise

-        if is_zip:
-            output_path = output_path or tempfile.gettempdir()
-            with ZipFile(download_to_path, "r") as zip_obj:
-                zip_obj.extractall(path=output_path)
-                zip_obj.namelist()
+            except Exception as e:
+                logger.error(f"Download failed: {str(e)}")
+                if Path(temp_file_name).exists():
+                    Path(temp_file_name).unlink(missing_ok=True)
+                raise
+            finally:
+                if Path(temp_file_name).exists():
+                    Path(temp_file_name).unlink(missing_ok=True)

-    Path(lock_path).unlink()
+    return Path(download_to_path)
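
For orientation, a minimal usage sketch of the updated _download helper as it stands after this diff. The URL, file name, and output directory below are hypothetical placeholders, and _download is a private helper normally invoked by pertpy's dataset loaders rather than called directly; parameter names are taken from the signature shown above.

from pertpy.data._dataloader import _download

# Hypothetical call exercising the options introduced in this PR.
archive_dir = _download(
    url="https://example.com/dataset.zip",  # placeholder URL
    output_file_name="dataset.zip",
    output_path="/tmp/pertpy_data",
    is_zip=True,      # extract after download; returns the extraction directory
    timeout=30,       # per-request timeout in seconds
    max_retries=3,    # re-attempt on OSError/RequestException up to 3 times
    retry_delay=5,    # wait 5 seconds between attempts
)
print(archive_dir)    # a Path, per the new `-> Path` return annotation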
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ requires = ["hatchling"]

 [project]
 name = "pertpy"
-version = "0.10.0"
+version = "0.10.1"
 description = "Perturbation Analysis in the scverse ecosystem."
 readme = "README.md"
 requires-python = ">=3.10,<3.14"
