@@ -7,12 +7,13 @@
 """
 import os
 import json
-import wget
+import requests
 import subprocess
 import re
 import urllib
 import urllib.request
 import patoolib
+import time
 
 
 class Repo2Data():
@@ -194,23 +195,39 @@ def _already_downloaded(self):
 
         return dl
 
-    def _wget_download(self):
-        """Install the data with wget library"""
-        print("Info : Starting to download with wget %s ..." %
-              (self._data_requirement_file["src"]))
-        # Try it few times to avoid truncated data
-        attempts = 0
-        while attempts < 3:
-            # Download with standard weblink
-            try:
-                wget.download(
-                    self._data_requirement_file["src"], out=self._dst_path)
-                print(" ")
-                attempts = 999
-            except urllib.error.ContentTooShortError:
-                attempts = attempts + 1
-                print("Warning : Truncated data, retry %d ..." % (attempts))
-                pass
+    def _url_download(self):
+        """
+        Under the assumption that the download link points to
+        a single tar/zip etc. file, use the requests library to
+        download the data to a relative path.
+        """
+        url = self._data_requirement_file["src"]
+        directory = self._dst_path
+        max_retries = 3
+        retry_delay = 5
+        for retry in range(max_retries):
+            response = requests.get(url, stream=True)
+            if response.status_code == 200:
+                # Create the directory if it doesn't exist
+                if not os.path.exists(directory):
+                    os.makedirs(directory)
+                # Get the filename from the URL
+                filename = url.split('/')[-1]
+                # Path to save the file
+                filepath = os.path.join(directory, filename)
+                # Save the content of the response to a file
+                with open(filepath, 'wb') as file:
+                    for chunk in response.iter_content(chunk_size=128):
+                        file.write(chunk)
+                print(f'File downloaded to: {filepath}')
+                return filepath
+            else:
+                print(f'Attempt {retry + 1} - Failed to download the file. Status code: {response.status_code}')
+                if retry < max_retries - 1:
+                    print(f'Retrying in {retry_delay} seconds...')
+                    time.sleep(retry_delay)
+        # If we get here, all retries failed.
+        print('Download failed after multiple attempts.')
 
     def _gdrive_download(self):
         """Install the data with google drive utility"""
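
Review note: the retry loop above is easy to exercise on its own. Below is a minimal standalone sketch of the same requests-based pattern; the function name, URL, and destination directory are illustrative placeholders, not values from this repository.

    import os
    import time
    import requests

    def fetch_with_retries(url, directory, max_retries=3, retry_delay=5):
        # Minimal sketch of the streaming download with retries used in _url_download.
        for attempt in range(max_retries):
            response = requests.get(url, stream=True)
            if response.status_code == 200:
                os.makedirs(directory, exist_ok=True)
                filepath = os.path.join(directory, url.split('/')[-1])
                with open(filepath, 'wb') as f:
                    # Stream the body in small chunks to keep memory flat.
                    for chunk in response.iter_content(chunk_size=128):
                        f.write(chunk)
                return filepath
            if attempt < max_retries - 1:
                time.sleep(retry_delay)
        return None

    # Illustrative call; the URL is a placeholder.
    # fetch_with_retries("https://example.com/data.tar.gz", "./data")
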
@@ -291,13 +308,14 @@ def _osf_download(self):
 
     def _scan_dl_type(self):
         """Detect which function to use for download"""
-        # if it is an http link, then we use wget
+        # If an http(s) link is provided and the url does not match one of the
+        # providers (osf, google drive, datalad/git), fall back to requests to download the file.
         if ((re.match(".*?(https://).*?", self._data_requirement_file["src"])
                 or re.match(".*?(http://).*?", self._data_requirement_file["src"]))
                 and not re.match(".*?(\\.git)", self._data_requirement_file["src"])
                 and not re.match(".*?(drive\\.google\\.com).*?", self._data_requirement_file["src"])
                 and not re.match(".*?(https://osf\\.io).*?", self._data_requirement_file["src"])):
-            self._wget_download()
+            self._url_download()
         # if the source link has a .git, we use datalad
         elif re.match(".*?(\\.git)", self._data_requirement_file["src"]):
             self._datalad_download()
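
Side note on the dispatch: to make the routing above easier to eyeball, here is a condensed, self-contained sketch of the same rules. The function name and return labels are illustrative, and the provider checks are rewritten as guard clauses, which should route the same way as the chained condition in _scan_dl_type.

    import re

    def route(src):
        # A .git link goes to datalad.
        if re.match(".*?(\\.git)", src):
            return "datalad"
        # Google Drive links go to the gdrive utility.
        if re.match(".*?(drive\\.google\\.com).*?", src):
            return "gdrive"
        # OSF links go to the osf client.
        if re.match(".*?(https://osf\\.io).*?", src):
            return "osf"
        # Any other http(s) link falls back to the requests download.
        if re.match(".*?(https?://).*?", src):
            return "url_download"
        # Anything else is out of scope for this sketch.
        return "unknown"

    print(route("https://github.com/user/repo.git"))   # datalad
    print(route("https://osf.io/abcde/download"))      # osf
    print(route("https://example.com/data.tar.gz"))    # url_download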