1
+ import shutil
1
2
import tempfile
3
+ import time
2
4
from pathlib import Path
3
5
from random import choice
4
6
from string import ascii_lowercase
7
9
import requests
8
10
from filelock import FileLock
9
11
from lamin_utils import logger
12
+ from requests .exceptions import RequestException
10
13
from rich .progress import Progress
11
14
12
15
@@ -17,7 +20,10 @@ def _download( # pragma: no cover
17
20
block_size : int = 1024 ,
18
21
overwrite : bool = False ,
19
22
is_zip : bool = False ,
20
- ) -> None :
23
+ timeout : int = 30 ,
24
+ max_retries : int = 3 ,
25
+ retry_delay : int = 5 ,
26
+ ) -> Path :
21
27
"""Downloads a dataset irrespective of the format.
22
28
23
29
Args:
@@ -27,6 +33,9 @@ def _download( # pragma: no cover
27
33
block_size: Block size for downloads in bytes.
28
34
overwrite: Whether to overwrite existing files.
29
35
is_zip: Whether the downloaded file needs to be unzipped.
36
+ timeout: Request timeout in seconds.
37
+ max_retries: Maximum number of retry attempts.
38
+ retry_delay: Delay between retries in seconds.
30
39
"""
31
40
if output_file_name is None :
32
41
letters = ascii_lowercase
@@ -35,36 +44,71 @@ def _download( # pragma: no cover
35
44
if output_path is None :
36
45
output_path = tempfile .gettempdir ()
37
46
38
- download_to_path = (
39
- f"{ output_path } { output_file_name } " if str (output_path ).endswith ("/" ) else f"{ output_path } /{ output_file_name } "
40
- )
47
+ download_to_path = Path (output_path ) / output_file_name
41
48
42
49
Path (output_path ).mkdir (parents = True , exist_ok = True )
43
- lock_path = f"{ output_path } /{ output_file_name } .lock"
44
- with FileLock (lock_path ):
50
+ lock_path = Path (output_path ) / f"{ output_file_name } .lock"
51
+
52
+ with FileLock (lock_path , timeout = 300 ):
45
53
if Path (download_to_path ).exists () and not overwrite :
46
54
logger .warning (f"File { download_to_path } already exists!" )
47
- return
55
+ return download_to_path
56
+
57
+ temp_file_name = Path (f"{ download_to_path } .part" )
58
+
59
+ retry_count = 0
60
+ while retry_count <= max_retries :
61
+ try :
62
+ head_response = requests .head (url , timeout = timeout )
63
+ head_response .raise_for_status ()
64
+ content_length = int (head_response .headers .get ("content-length" , 0 ))
65
+
66
+ free_space = shutil .disk_usage (output_path ).free
67
+ if content_length > free_space :
68
+ raise OSError (
69
+ f"Insufficient disk space. Need { content_length } bytes, but only { free_space } available."
70
+ )
71
+
72
+ response = requests .get (url , stream = True )
73
+ response .raise_for_status ()
74
+ total = int (response .headers .get ("content-length" , 0 ))
48
75
49
- temp_file_name = f"{ download_to_path } .part"
76
+ with Progress (refresh_per_second = 5 ) as progress :
77
+ task = progress .add_task ("[red]Downloading..." , total = total )
78
+ with Path (temp_file_name ).open ("wb" ) as file :
79
+ for data in response .iter_content (block_size ):
80
+ file .write (data )
81
+ progress .update (task , advance = len (data ))
82
+ progress .update (task , completed = total , refresh = True )
50
83
51
- response = requests .get (url , stream = True )
52
- total = int (response .headers .get ("content-length" , 0 ))
84
+ Path (temp_file_name ).replace (download_to_path )
53
85
54
- with Progress (refresh_per_second = 5 ) as progress :
55
- task = progress .add_task ("[red]Downloading..." , total = total )
56
- with Path (temp_file_name ).open ("wb" ) as file :
57
- for data in response .iter_content (block_size ):
58
- file .write (data )
59
- progress .update (task , advance = block_size )
60
- progress .update (task , completed = total , refresh = True )
86
+ if is_zip :
87
+ with ZipFile (download_to_path , "r" ) as zip_obj :
88
+ zip_obj .extractall (path = output_path )
89
+ return Path (output_path )
61
90
62
- Path (temp_file_name ).replace (download_to_path )
91
+ return download_to_path
92
+ except (OSError , RequestException ) as e :
93
+ retry_count += 1
94
+ if retry_count <= max_retries :
95
+ logger .warning (
96
+ f"Download attempt { retry_count } /{ max_retries } failed: { str (e )} . Retrying in { retry_delay } seconds..."
97
+ )
98
+ time .sleep (retry_delay )
99
+ else :
100
+ logger .error (f"Download failed after { max_retries } attempts: { str (e )} " )
101
+ if Path (temp_file_name ).exists ():
102
+ Path (temp_file_name ).unlink (missing_ok = True )
103
+ raise
63
104
64
- if is_zip :
65
- output_path = output_path or tempfile .gettempdir ()
66
- with ZipFile (download_to_path , "r" ) as zip_obj :
67
- zip_obj .extractall (path = output_path )
68
- zip_obj .namelist ()
105
+ except Exception as e :
106
+ logger .error (f"Download failed: { str (e )} " )
107
+ if Path (temp_file_name ).exists ():
108
+ Path (temp_file_name ).unlink (missing_ok = True )
109
+ raise
110
+ finally :
111
+ if Path (temp_file_name ).exists ():
112
+ Path (temp_file_name ).unlink (missing_ok = True )
69
113
70
- Path (lock_path ). unlink ( )
114
+ return Path (download_to_path )
0 commit comments