Skip to content

Commit 4b6f5a3

Browse files
Update prepare to work in jupyterlite (#1)
* Update prepare to work in jupyterlite * Format code * Remove test file from root dir * Minor changes * Very nice * Update docs * Add quotation marks in msg * Minor changes * Extracting directories works in jupyterlite * Works in jupyterlite albeit with a performance hit
1 parent 2103bcf commit 4b6f5a3

File tree

4 files changed

+54
-33
lines changed

4 files changed

+54
-33
lines changed

doc/source/usage.rst

+5-5
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,11 @@ Use the :func:`skillsnetwork.download` function to download files:
5858
import skillsnetwork
5959
await skillsnetwork.download("https://www.example.com/my/file.json")
6060
61-
By default, the saved path will be printed:
61+
By default, the saved path will be printed (See the :func:`skillsnetwork.download` api to change):
6262

6363
.. code-block:: console
6464
65-
./file.json
65+
Saved as './file.json'
6666
6767
This confirms the file is saved in your lab environment:
6868

@@ -82,18 +82,18 @@ Use the :func:`skillsnetwork.prepare` to manage large compressed datasets or dat
8282
import skillsnetwork
8383
await skillsnetwork.prepare("https://www.example.com/my/images.zip")
8484
85-
By default, the saved path will be printed:
85+
By default, the location the extracted data is saved will be printed:
8686

8787
.. code-block:: console
8888
89-
.
89+
Saved to '.'
9090
9191
This confirms the dataset was extracted to your current working directory in your lab environment:
9292

9393
.. code-block:: python
9494
9595
from pathlib import Path
96-
for path in Path(".").rglob("*"):
96+
for path in Path(".").iterdir():
9797
print(path)
9898
9999
.. code-block:: console

skillsnetwork/core.py

+29-19
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ async def download(
146146
>>> import skillsnetwork
147147
>>> path = "./my_file.txt"
148148
>>> await skillsnetwork.download("https://example.com/myfile", path)
149+
Saved as './my_file.txt'
149150
>>> with open(path, "r") as f:
150151
>>> content = f.read()
151152
@@ -166,7 +167,7 @@ async def download(
166167
async for chunk in _get_chunks(url, chunk_size):
167168
f.write(chunk)
168169
if verbose:
169-
print(relpath(path.resolve()))
170+
print(f"Saved as '{relpath(path.resolve())}'")
170171

171172

172173
async def read(url: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> bytes:
@@ -189,39 +190,46 @@ async def read(url: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> bytes:
189190
async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) -> None:
190191
"""
191192
Prepares a dataset for learners. Downloads a dataset from the given url,
192-
decompresses it if necessary, and symlinks it so it's available in the desired path.
193+
decompresses it if necessary. If not using jupyterlite, will extract to
194+
/tmp and symlink it so it's available at the desired path.
193195
194196
>>> import skillsnetwork
195197
>>> await skillsnetwork.prepare("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML0187EN-SkillsNetwork/labs/module%203/images/images.tar.gz")
198+
Saved to '.'
196199
197200
:param url: The URL to download the dataset from.
198201
:param path: The path the dataset will be available at. Current working directory by default.
199202
:raise InvalidURLException: When URL is invalid.
200203
:raise FileExistsError: it raises this when a file to be symlinked already exists.
201-
:raise ValueError: When requested path is in /tmp.
204+
:raise ValueError: When requested path is in /tmp, or cannot be saved to path.
202205
"""
203206

204207
filename = Path(urlparse(url).path).name
205208
path = Path.cwd() if path is None else Path(path)
206209
# Check if path contains /tmp
207210
if Path("/tmp") in path.parents:
208211
raise ValueError("path must not be in /tmp")
212+
elif path.is_file():
213+
raise ValueError("Datasets must be prepared to directories, not files")
209214
# Create the target path if it doesn't exist yet
210215
path.mkdir(exist_ok=True)
211216

212217
# For avoiding collisions with any other files the user may have downloaded to /tmp/
213-
tmp_extract_dir = Path(f"/tmp/skills-network-{hash(url)}")
214-
tmp_download_file = Path(f"/tmp/{tmp_extract_dir.name}-{filename}")
218+
219+
dname = f"skills-network-{hash(url)}"
220+
# The directory to extract data to. If not jupyterlite, to be symlinked to as well
221+
extract_dir = path if _is_jupyterlite() else Path(f"/tmp/{dname}")
222+
# The file to download the (possibly) compressed data to
223+
tmp_download_file = Path(f"/tmp/{dname}-{filename}")
215224
# Download the dataset to tmp_download_file file
216225
# File will be overwritten if it already exists
217226
await download(url, tmp_download_file, verbose=False)
218227

219-
# Delete tmp_extract_dir directory if it already exists
220-
if tmp_extract_dir.is_dir():
221-
shutil.rmtree(tmp_extract_dir)
222-
223-
# Create tmp_extract_dir
224-
tmp_extract_dir.mkdir()
228+
# Delete extract_dir directory if it already exists
229+
if not _is_jupyterlite():
230+
if extract_dir.is_dir():
231+
shutil.rmtree(extract_dir)
232+
extract_dir.mkdir()
225233

226234
if tarfile.is_tarfile(tmp_download_file):
227235
with tarfile.open(tmp_download_file) as tf:
@@ -235,7 +243,7 @@ async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) ->
235243
pbar = tqdm(iterable=tf.getmembers(), total=len(tf.getmembers()))
236244
pbar.set_description(f"Extracting {filename}")
237245
for member in pbar:
238-
tf.extract(member=member, path=tmp_extract_dir)
246+
tf.extract(member=member, path=extract_dir)
239247
tmp_download_file.unlink()
240248
elif zipfile.is_zipfile(tmp_download_file):
241249
with zipfile.ZipFile(tmp_download_file) as zf:
@@ -249,18 +257,20 @@ async def prepare(url: str, path: Optional[str] = None, verbose: bool = True) ->
249257
pbar = tqdm(iterable=zf.infolist(), total=len(zf.infolist()))
250258
pbar.set_description(f"Extracting {filename}")
251259
for member in pbar:
252-
zf.extract(member=member, path=tmp_extract_dir)
260+
zf.extract(member=member, path=extract_dir)
253261
tmp_download_file.unlink()
254262
else:
255-
_verify_files_dont_exist([path / tmp_download_file.name])
256-
pass # No extraction necessary
263+
_verify_files_dont_exist([path / filename])
264+
shutil.move(tmp_download_file, extract_dir / filename)
257265

258-
# Now symlink top-level file objects in tmp_extract_dir
259-
for child in filter(_is_file_to_symlink, tmp_extract_dir.iterdir()):
260-
(path / child.name).symlink_to(child, target_is_directory=child.is_dir())
266+
# If in jupyterlite environment, the extract_dir = path, so the files are already there.
267+
if not _is_jupyterlite():
268+
# If not in jupyterlite environment, symlink top-level file objects in extract_dir
269+
for child in filter(_is_file_to_symlink, extract_dir.iterdir()):
270+
(path / child.name).symlink_to(child, target_is_directory=child.is_dir())
261271

262272
if verbose:
263-
print(relpath(path.resolve()))
273+
print(f"Saved to '{relpath(path.resolve())}'")
264274

265275

266276
if _is_jupyterlite():

tests/test.csv

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
2+
2008-12-01,Albury,13.4,22.9,0.6,NA,NA,W,44,W,WNW,20,24,71,22,1007.7,1007.1,8,NA,16.9,21.8,No,0,No
3+
2008-12-02,Albury,7.4,25.1,0,NA,NA,WNW,44,NNW,WSW,4,22,44,25,1010.6,1007.8,NA,NA,17.2,24.3,No,0,No
4+
2008-12-03,Albury,12.9,25.7,0,NA,NA,WSW,46,W,WSW,19,26,38,30,1007.6,1008.7,NA,2,21,23.2,No,0,No

tests/test_skillsnetwork.py

+16-9
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import os
22
import shutil
3-
from random import choice
4-
from string import ascii_uppercase
5-
63
import pytest
74
import skillsnetwork
5+
6+
from pathlib import Path
87
from pytest_httpserver import httpserver
8+
from random import choice
9+
from string import ascii_uppercase
910

1011

1112
def test_backwards_compatibility():
@@ -53,7 +54,6 @@ async def test_prepare_dataset_invalid_url():
5354

5455
@pytest.mark.asyncio
5556
async def test_prepare_dataset_tar_no_path(httpserver):
56-
5757
url = "/test.tar.gz"
5858
expected_directory = "test"
5959
try:
@@ -74,10 +74,8 @@ async def test_prepare_dataset_tar_no_path(httpserver):
7474

7575
@pytest.mark.asyncio
7676
async def test_prepare_dataset_tar_with_path(httpserver):
77-
7877
url = "/test.tar.gz"
7978
path = "example"
80-
8179
try:
8280
shutil.rmtree(path) # clean up any previous test
8381
except FileNotFoundError:
@@ -92,7 +90,6 @@ async def test_prepare_dataset_tar_with_path(httpserver):
9290

9391
@pytest.mark.asyncio
9492
async def test_prepare_dataset_zip_no_path(httpserver):
95-
9693
url = "/test.zip"
9794
expected_directory = "test"
9895
try:
@@ -113,10 +110,8 @@ async def test_prepare_dataset_zip_no_path(httpserver):
113110

114111
@pytest.mark.asyncio
115112
async def test_prepare_dataset_zip_with_path(httpserver):
116-
117113
url = "/test.zip"
118114
path = "tests/example"
119-
120115
try:
121116
shutil.rmtree(path) # clean up any previous test
122117
except FileNotFoundError:
@@ -127,3 +122,15 @@ async def test_prepare_dataset_zip_with_path(httpserver):
127122
await skillsnetwork.prepare_dataset(httpserver.url_for(url), path=path)
128123
assert os.path.isdir(path)
129124
shutil.rmtree(path)
125+
126+
127+
@pytest.mark.asyncio
128+
async def test_prepare_non_compressed_dataset_with_path(httpserver):
129+
url = "/test.csv"
130+
path = "."
131+
expected_path = Path("./test.csv")
132+
with open("tests/test.csv", "rb") as expected_data:
133+
httpserver.expect_request(url).respond_with_data(expected_data)
134+
await skillsnetwork.prepare_dataset(httpserver.url_for(url), path=path)
135+
assert expected_path.exists()
136+
expected_path.unlink()

0 commit comments

Comments
 (0)