More robust download (#721)

Zethson · web-flow · commit c6baf5f9120d · 2025-03-04T09:15:02.000+01:00
* Support Python 3.13

* Remove scikit-misc

* Add scikit-misc

* docs

* More robust download

* Further improvements

* Improved download

* Remove comments
diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml
@@ -1,5 +1,5 @@
-name-template: "0.10.0 🌈"
-tag-template: 0.10.0
+name-template: "0.10.1 🌈"
+tag-template: 0.10.1
 exclude-labels:
     - "skip-changelog"
 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -27,10 +27,10 @@ jobs:
                     - os: ubuntu-22.04
                       python: "3.13"
                       run_mode: "fast"
-                    # - os: ubuntu-latest
-                    #   python: "3.13"
-                    #   run_mode: slow
-                    #   pip-flags: "--pre"
+                    - os: ubuntu-latest
+                      python: "3.13"
+                      run_mode: slow
+                      pip-flags: "--pre"
 
         env:
             OS: ${{ matrix.os }}
diff --git a/pertpy/__init__.py b/pertpy/__init__.py
@@ -2,7 +2,7 @@
 
 __author__ = "Lukas Heumos"
 __email__ = "lukas.heumos@posteo.net"
-__version__ = "0.10.0"
+__version__ = "0.10.1"
 
 import warnings
 
diff --git a/pertpy/data/_dataloader.py b/pertpy/data/_dataloader.py
@@ -1,4 +1,6 @@
+import shutil
 import tempfile
+import time
 from pathlib import Path
 from random import choice
 from string import ascii_lowercase
@@ -7,6 +9,7 @@
 import requests
 from filelock import FileLock
 from lamin_utils import logger
+from requests.exceptions import RequestException
 from rich.progress import Progress
 
 
@@ -17,7 +20,10 @@ def _download(  # pragma: no cover
     block_size: int = 1024,
     overwrite: bool = False,
     is_zip: bool = False,
-) -> None:
+    timeout: int = 30,
+    max_retries: int = 3,
+    retry_delay: int = 5,
+) -> Path:
     """Downloads a dataset irrespective of the format.
 
     Args:
@@ -27,6 +33,9 @@ def _download(  # pragma: no cover
         block_size: Block size for downloads in bytes.
         overwrite: Whether to overwrite existing files.
         is_zip: Whether the downloaded file needs to be unzipped.
+        timeout: Request timeout in seconds.
+        max_retries: Maximum number of retry attempts.
+        retry_delay: Delay between retries in seconds.
     """
     if output_file_name is None:
         letters = ascii_lowercase
@@ -35,36 +44,71 @@ def _download(  # pragma: no cover
     if output_path is None:
         output_path = tempfile.gettempdir()
 
-    download_to_path = (
-        f"{output_path}{output_file_name}" if str(output_path).endswith("/") else f"{output_path}/{output_file_name}"
-    )
+    download_to_path = Path(output_path) / output_file_name
 
     Path(output_path).mkdir(parents=True, exist_ok=True)
-    lock_path = f"{output_path}/{output_file_name}.lock"
-    with FileLock(lock_path):
+    lock_path = Path(output_path) / f"{output_file_name}.lock"
+
+    with FileLock(lock_path, timeout=300):
         if Path(download_to_path).exists() and not overwrite:
             logger.warning(f"File {download_to_path} already exists!")
-            return
+            return download_to_path
+
+        temp_file_name = Path(f"{download_to_path}.part")
+
+        retry_count = 0
+        while retry_count <= max_retries:
+            try:
+                head_response = requests.head(url, timeout=timeout)
+                head_response.raise_for_status()
+                content_length = int(head_response.headers.get("content-length", 0))
+
+                free_space = shutil.disk_usage(output_path).free
+                if content_length > free_space:
+                    raise OSError(
+                        f"Insufficient disk space. Need {content_length} bytes, but only {free_space} available."
+                    )
+
+                response = requests.get(url, stream=True)
+                response.raise_for_status()
+                total = int(response.headers.get("content-length", 0))
 
-        temp_file_name = f"{download_to_path}.part"
+                with Progress(refresh_per_second=5) as progress:
+                    task = progress.add_task("[red]Downloading...", total=total)
+                    with Path(temp_file_name).open("wb") as file:
+                        for data in response.iter_content(block_size):
+                            file.write(data)
+                            progress.update(task, advance=len(data))
+                        progress.update(task, completed=total, refresh=True)
 
-        response = requests.get(url, stream=True)
-        total = int(response.headers.get("content-length", 0))
+                Path(temp_file_name).replace(download_to_path)
 
-        with Progress(refresh_per_second=5) as progress:
-            task = progress.add_task("[red]Downloading...", total=total)
-            with Path(temp_file_name).open("wb") as file:
-                for data in response.iter_content(block_size):
-                    file.write(data)
-                    progress.update(task, advance=block_size)
-            progress.update(task, completed=total, refresh=True)
+                if is_zip:
+                    with ZipFile(download_to_path, "r") as zip_obj:
+                        zip_obj.extractall(path=output_path)
+                    return Path(output_path)
 
-        Path(temp_file_name).replace(download_to_path)
+                return download_to_path
+            except (OSError, RequestException) as e:
+                retry_count += 1
+                if retry_count <= max_retries:
+                    logger.warning(
+                        f"Download attempt {retry_count}/{max_retries} failed: {str(e)}. Retrying in {retry_delay} seconds..."
+                    )
+                    time.sleep(retry_delay)
+                else:
+                    logger.error(f"Download failed after {max_retries} attempts: {str(e)}")
+                    if Path(temp_file_name).exists():
+                        Path(temp_file_name).unlink(missing_ok=True)
+                    raise
 
-        if is_zip:
-            output_path = output_path or tempfile.gettempdir()
-            with ZipFile(download_to_path, "r") as zip_obj:
-                zip_obj.extractall(path=output_path)
-                zip_obj.namelist()
+            except Exception as e:
+                logger.error(f"Download failed: {str(e)}")
+                if Path(temp_file_name).exists():
+                    Path(temp_file_name).unlink(missing_ok=True)
+                raise
+            finally:
+                if Path(temp_file_name).exists():
+                    Path(temp_file_name).unlink(missing_ok=True)
 
-    Path(lock_path).unlink()
+        return Path(download_to_path)
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ requires = ["hatchling"]
 
 [project]
 name = "pertpy"
-version = "0.10.0"
+version = "0.10.1"
 description = "Perturbation Analysis in the scverse ecosystem."
 readme = "README.md"
 requires-python = ">=3.10,<3.14"