Skip to content

Commit 4b6f5a3

Browse files
Update prepare to work in jupyterlite (#1)
* Update prepare to work in jupyterlite * Format code * Remove test file from root dir * Minor changes * Very nice * Update docs * Add quotation marks in msg * Minor changes * Extracting directories works in jupyterlite * Works in jupyterlite albeit with a performance hit
1 parent 2103bcf commit 4b6f5a3

File tree

4 files changed

+54
-33
lines changed

4 files changed

+54
-33
lines changed

doc/source/usage.rst

+5-5
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,11 @@ Use the :func:`skillsnetwork.download` function to download files:
5858
import skillsnetwork
5959
await skillsnetwork.download("https://www.example.com/my/file.json")
6060
61-
By default, the saved path will be printed:
61+
By default, the saved path will be printed (See the :func:`skillsnetwork.download` api to change):
6262

6363
.. code-block:: console
6464
65-
./file.json
65+
Saved as './file.json'
6666
6767
This confirms the file is saved in your lab environment:
6868

@@ -82,18 +82,18 @@ Use the :func:`skillsnetwork.prepare` to manage large compressed datasets or dat
8282
import skillsnetwork
8383
await skillsnetwork.prepare("https://www.example.com/my/images.zip")
8484
85-
By default, the saved path will be printed:
85+
By default, the location the extracted data is saved will be printed:
8686

8787
.. code-block:: console
8888
89-
.
89+
Saved to '.'
9090
9191
This confirms the dataset was extracted to your current working directory in your lab environment:
9292

9393
.. code-block:: python
9494
9595
from pathlib import Path
96-
for path in Path(".").rglob("*"):
96+
for path in Path(".").iterdir():
9797
print(path)
9898
9999
.. code-block:: console

skillsnetwork/core.py

+29-19
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ async def download(
146146
>>> import skillsnetwork
147147
>>> path = "./my_file.txt"
148148
>>> await skillsnetwork.download("https://example.com/myfile", path)
149+
Saved as './my_file.txt'
149150
>>> with open(path, "r") as f:
150151
>>> content = f.read()
151152
@@ -166,7 +167,7 @@ async def download(
166167
async for chunk in _get_chunks(url, chunk_size):
167168
f.write(chunk)
168169
if verbose:
169-
print(relpath(path.resolve()))
170+
print(f"Saved as '{relpath(path.resolve())}'")
170171

171172

172173
async def read(url: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> bytes:
@@ -189,39 +190,46 @@ async def read(url: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> bytes:
189190
async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) -> None:
190191
"""
191192
Prepares a dataset for learners. Downloads a dataset from the given url,
192-
decompresses it if necessary, and symlinks it so it's available in the desired path.
193+
decompresses it if necessary. If not using jupyterlite, will extract to
194+
/tmp and symlink it so it's available at the desired path.
193195
194196
>>> import skillsnetwork
195197
>>> await skillsnetwork.prepare("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0187EN-SkillsNetwork/labs/module%203/images/images.tar.gz")
198+
Saved to '.'
196199
197200
:param url: The URL to download the dataset from.
198201
:param path: The path the dataset will be available at. Current working directory by default.
199202
:raise InvalidURLException: When URL is invalid.
200203
:raise FileExistsError: it raises this when a file to be symlinked already exists.
201-
:raise ValueError: When requested path is in /tmp.
204+
:raise ValueError: When requested path is in /tmp, or cannot be saved to path.
202205
"""
203206

204207
filename = Path(urlparse(url).path).name
205208
path = Path.cwd() if path is None else Path(path)
206209
# Check if path contains /tmp
207210
if Path("/tmp") in path.parents:
208211
raise ValueError("path must not be in /tmp")
212+
elif path.is_file():
213+
raise ValueError("Datasets must be prepared to directories, not files")
209214
# Create the target path if it doesn't exist yet
210215
path.mkdir(exist_ok=True)
211216

212217
# For avoiding collisions with any other files the user may have downloaded to /tmp/
213-
tmp_extract_dir = Path(f"/tmp/skills-network-{hash(url)}")
214-
tmp_download_file = Path(f"/tmp/{tmp_extract_dir.name}-{filename}")
218+
219+
dname = f"skills-network-{hash(url)}"
220+
# The directory to extract data to. If not jupyterlite, to be symlinked to as well
221+
extract_dir = path if _is_jupyterlite() else Path(f"/tmp/{dname}")
222+
# The file to download the (possibly) compressed data to
223+
tmp_download_file = Path(f"/tmp/{dname}-{filename}")
215224
# Download the dataset to tmp_download_file file
216225
# File will be overwritten if it already exists
217226
await download(url, tmp_download_file, verbose=False)
218227

219-
# Delete tmp_extract_dir directory if it already exists
220-
if tmp_extract_dir.is_dir():
221-
shutil.rmtree(tmp_extract_dir)
222-
223-
# Create tmp_extract_dir
224-
tmp_extract_dir.mkdir()
228+
# Delete extract_dir directory if it already exists
229+
if not _is_jupyterlite():
230+
if extract_dir.is_dir():
231+
shutil.rmtree(extract_dir)
232+
extract_dir.mkdir()
225233

226234
if tarfile.is_tarfile(tmp_download_file):
227235
with tarfile.open(tmp_download_file) as tf:
@@ -235,7 +243,7 @@ async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) ->
235243
pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers()))
236244
pbar.set_description(f"Extracting {filename}")
237245
for member in pbar:
238-
tf.extract(member=member, path=tmp_extract_dir)
246+
tf.extract(member=member, path=extract_dir)
239247
tmp_download_file.unlink()
240248
elif zipfile.is_zipfile(tmp_download_file):
241249
with zipfile.ZipFile(tmp_download_file) as zf:
@@ -249,18 +257,20 @@ async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) ->
249257
pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist()))
250258
pbar.set_description(f"Extracting {filename}")
251259
for member in pbar:
252-
zf.extract(member=member, path=tmp_extract_dir)
260+
zf.extract(member=member, path=extract_dir)
253261
tmp_download_file.unlink()
254262
else:
255-
_verify_files_dont_exist([path / tmp_download_file.name])
256-
pass # No extraction necessary
263+
_verify_files_dont_exist([path / filename])
264+
shutil.move(tmp_download_file, extract_dir / filename)
257265

258-
# Now symlink top-level file objects in tmp_extract_dir
259-
for child in filter(_is_file_to_symlink, tmp_extract_dir.iterdir()):
260-
(path / child.name).symlink_to(child, target_is_directory=child.is_dir())
266+
# If in jupyterlite environment, the extract_dir = path, so the files are already there.
267+
if not _is_jupyterlite():
268+
# If not in jupyterlite environment, symlink top-level file objects in extract_dir
269+
for child in filter(_is_file_to_symlink, extract_dir.iterdir()):
270+
(path / child.name).symlink_to(child, target_is_directory=child.is_dir())
261271

262272
if verbose:
263-
print(relpath(path.resolve()))
273+
print(f"Saved to '{relpath(path.resolve())}'")
264274

265275

266276
if _is_jupyterlite():

tests/test.csv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
2+
2008-12-01,Albury,13.4,22.9,0.6,NA,NA,W,44,W,WNW,20,24,71,22,1007.7,1007.1,8,NA,16.9,21.8,No,0,No
3+
2008-12-02,Albury,7.4,25.1,0,NA,NA,WNW,44,NNW,WSW,4,22,44,25,1010.6,1007.8,NA,NA,17.2,24.3,No,0,No
4+
2008-12-03,Albury,12.9,25.7,0,NA,NA,WSW,46,W,WSW,19,26,38,30,1007.6,1008.7,NA,2,21,23.2,No,0,No

tests/test_skillsnetwork.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import os
22
import shutil
3-
from random import choice
4-
from string import ascii_uppercase
5-
63
import pytest
74
import skillsnetwork
5+
6+
from pathlib import Path
87
from pytest_httpserver import httpserver
8+
from random import choice
9+
from string import ascii_uppercase
910

1011

1112
def test_backwards_compatibility():
@@ -53,7 +54,6 @@ async def test_prepare_dataset_invalid_url():
5354

5455
@pytest.mark.asyncio
5556
async def test_prepare_dataset_tar_no_path(httpserver):
56-
5757
url = "/test.tar.gz"
5858
expected_directory = "test"
5959
try:
@@ -74,10 +74,8 @@ async def test_prepare_dataset_tar_no_path(httpserver):
7474

7575
@pytest.mark.asyncio
7676
async def test_prepare_dataset_tar_with_path(httpserver):
77-
7877
url = "/test.tar.gz"
7978
path = "example"
80-
8179
try:
8280
shutil.rmtree(path) # clean up any previous test
8381
except FileNotFoundError:
@@ -92,7 +90,6 @@ async def test_prepare_dataset_tar_with_path(httpserver):
9290

9391
@pytest.mark.asyncio
9492
async def test_prepare_dataset_zip_no_path(httpserver):
95-
9693
url = "/test.zip"
9794
expected_directory = "test"
9895
try:
@@ -113,10 +110,8 @@ async def test_prepare_dataset_zip_no_path(httpserver):
113110

114111
@pytest.mark.asyncio
115112
async def test_prepare_dataset_zip_with_path(httpserver):
116-
117113
url = "/test.zip"
118114
path = "tests/example"
119-
120115
try:
121116
shutil.rmtree(path) # clean up any previous test
122117
except FileNotFoundError:
@@ -127,3 +122,15 @@ async def test_prepare_dataset_zip_with_path(httpserver):
127122
await skillsnetwork.prepare_dataset(httpserver.url_for(url), path=path)
128123
assert os.path.isdir(path)
129124
shutil.rmtree(path)
125+
126+
127+
@pytest.mark.asyncio
128+
async def test_prepare_non_compressed_dataset_with_path(httpserver):
129+
url = "/test.csv"
130+
path = "."
131+
expected_path = Path("./test.csv")
132+
with open("tests/test.csv", "rb") as expected_data:
133+
httpserver.expect_request(url).respond_with_data(expected_data)
134+
await skillsnetwork.prepare_dataset(httpserver.url_for(url), path=path)
135+
assert expected_path.exists()
136+
expected_path.unlink()

0 commit comments

Comments
 (0)