@@ -146,6 +146,7 @@ async def download(
146
146
>>> import skillsnetwork
147
147
>>> path = "./my_file.txt"
148
148
>>> await skillsnetwork.download("https://example.com/myfile", path)
149
+ Saved as './my_file.txt'
149
150
>>> with open(path, "r") as f:
150
151
>>> content = f.read()
151
152
@@ -166,7 +167,7 @@ async def download(
166
167
async for chunk in _get_chunks (url , chunk_size ):
167
168
f .write (chunk )
168
169
if verbose :
169
- print (relpath (path .resolve ()))
170
+ print (f"Saved as ' { relpath (path .resolve ())} '" )
170
171
171
172
172
173
async def read (url : str , chunk_size : int = DEFAULT_CHUNK_SIZE ) -> bytes :
@@ -189,39 +190,46 @@ async def read(url: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> bytes:
189
190
async def prepare (url : str , path : Optional [str ] = None , verbose : bool = True ) -> None :
190
191
"""
191
192
Prepares a dataset for learners. Downloads a dataset from the given url,
192
- decompresses it if necessary, and symlinks it so it's available in the desired path.
193
+ decompresses it if necessary. If not using jupyterlite, will extract to
194
+ /tmp and symlink it so it's available at the desired path.
193
195
194
196
>>> import skillsnetwork
195
197
>>> await skillsnetwork.prepare("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0187EN-SkillsNetwork/labs/module%203/images/images.tar.gz")
198
+ Saved to '.'
196
199
197
200
:param url: The URL to download the dataset from.
198
201
:param path: The path the dataset will be available at. Current working directory by default.
199
202
:raise InvalidURLException: When URL is invalid.
200
203
:raise FileExistsError: it raises this when a file to be symlinked already exists.
201
- :raise ValueError: When requested path is in /tmp.
204
+ :raise ValueError: When the requested path is in /tmp, or the dataset cannot be saved to path.
202
205
"""
203
206
204
207
filename = Path (urlparse (url ).path ).name
205
208
path = Path .cwd () if path is None else Path (path )
206
209
# Check if path contains /tmp
207
210
if Path ("/tmp" ) in path .parents :
208
211
raise ValueError ("path must not be in /tmp" )
212
+ elif path .is_file ():
213
+ raise ValueError ("Datasets must be prepared to directories, not files" )
209
214
# Create the target path if it doesn't exist yet
210
215
path .mkdir (exist_ok = True )
211
216
212
217
# For avoiding collisions with any other files the user may have downloaded to /tmp/
213
- tmp_extract_dir = Path (f"/tmp/skills-network-{ hash (url )} " )
214
- tmp_download_file = Path (f"/tmp/{ tmp_extract_dir .name } -{ filename } " )
218
+
219
+ dname = f"skills-network-{ hash (url )} "
220
+ # The file to extract data to. If not jupyterlite, to be symlinked to as well
221
+ extract_dir = path if _is_jupyterlite () else Path (f"/tmp/{ dname } " )
222
+ # The file to download the (possibly) compressed data to
223
+ tmp_download_file = Path (f"/tmp/{ dname } -{ filename } " )
215
224
# Download the dataset to tmp_download_file file
216
225
# File will be overwritten if it already exists
217
226
await download (url , tmp_download_file , verbose = False )
218
227
219
- # Delete tmp_extract_dir directory if it already exists
220
- if tmp_extract_dir .is_dir ():
221
- shutil .rmtree (tmp_extract_dir )
222
-
223
- # Create tmp_extract_dir
224
- tmp_extract_dir .mkdir ()
228
+ # Delete extract_dir directory if it already exists
229
+ if not _is_jupyterlite ():
230
+ if extract_dir .is_dir ():
231
+ shutil .rmtree (extract_dir )
232
+ extract_dir .mkdir ()
225
233
226
234
if tarfile .is_tarfile (tmp_download_file ):
227
235
with tarfile .open (tmp_download_file ) as tf :
@@ -235,7 +243,7 @@ async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) ->
235
243
pbar = tqdm (iterable = tf .getmembers (), total = len (tf .getmembers ()))
236
244
pbar .set_description (f"Extracting { filename } " )
237
245
for member in pbar :
238
- tf .extract (member = member , path = tmp_extract_dir )
246
+ tf .extract (member = member , path = extract_dir )
239
247
tmp_download_file .unlink ()
240
248
elif zipfile .is_zipfile (tmp_download_file ):
241
249
with zipfile .ZipFile (tmp_download_file ) as zf :
@@ -249,18 +257,20 @@ async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) ->
249
257
pbar = tqdm (iterable = zf .infolist (), total = len (zf .infolist ()))
250
258
pbar .set_description (f"Extracting { filename } " )
251
259
for member in pbar :
252
- zf .extract (member = member , path = tmp_extract_dir )
260
+ zf .extract (member = member , path = extract_dir )
253
261
tmp_download_file .unlink ()
254
262
else :
255
- _verify_files_dont_exist ([path / tmp_download_file . name ])
256
- pass # No extraction necessary
263
+ _verify_files_dont_exist ([path / filename ])
264
+ shutil . move ( tmp_download_file , extract_dir / filename )
257
265
258
- # Now symlink top-level file objects in tmp_extract_dir
259
- for child in filter (_is_file_to_symlink , tmp_extract_dir .iterdir ()):
260
- (path / child .name ).symlink_to (child , target_is_directory = child .is_dir ())
266
+ # If in jupyterlite environment, the extract_dir = path, so the files are already there.
267
+ if not _is_jupyterlite ():
268
+ # If not in jupyterlite environment, symlink top-level file objects in extract_dir
269
+ for child in filter (_is_file_to_symlink , extract_dir .iterdir ()):
270
+ (path / child .name ).symlink_to (child , target_is_directory = child .is_dir ())
261
271
262
272
if verbose :
263
- print (relpath (path .resolve ()))
273
+ print (f"Saved to ' { relpath (path .resolve ())} '" )
264
274
265
275
266
276
if _is_jupyterlite ():
0 commit comments