add instruction on how to get HF dataset URI

deependujha · deependujha · commit 58a077fa3b80 · 2025-02-08T15:58:25.000+05:30
diff --git a/README.md b/README.md
@@ -243,6 +243,15 @@ dataset = StreamingDataset('s3://my-bucket/my-data', cache_dir="/path/to/cache")
 
 To use your favorite Hugging Face dataset with LitData, simply pass its URL to `StreamingDataset`.
 
+<details>
+  <summary>How do I get the Hugging Face (HF) dataset URI?</summary>
+
+- To get the HF dataset URI, open the dataset page on Hugging Face, click **Use this dataset** → **Polars**, and copy the `hf://` URI shown there, without the trailing filename.
+- For example, if the URI is `hf://datasets/open-thoughts/OpenThoughts-114k/data/train-*.parquet`, remove the trailing `train-*.parquet`.
+- The URI to pass to `StreamingDataset` is then **`hf://datasets/open-thoughts/OpenThoughts-114k/data`**.
+
+</details>
+
 ```python
 import litdata as ld
 
diff --git a/src/litdata/streaming/downloader.py b/src/litdata/streaming/downloader.py
@@ -205,10 +205,8 @@ def download_file(self, remote_filepath: str, local_filepath: str) -> None:
             temp_path = local_filepath + ".tmp"  # Avoid partial writes
             try:
                 with self.fs.open(remote_filepath, "rb") as cloud_file, open(temp_path, "wb") as local_file:
-                    data = cloud_file.read()
-                    if isinstance(data, str):
-                        raise ValueError(f"Expected parquet data in bytes format. But found str. {remote_filepath}")
-                    local_file.write(data)
+                    for chunk in iter(lambda: cloud_file.read(4096), b""):  # Stream in 4KB chunks local_file.
+                        local_file.write(chunk)
 
                 os.rename(temp_path, local_filepath)  # Atomic move after successful write