Skip to content

tar index as json file #1807

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
51 changes: 34 additions & 17 deletions fsspec/implementations/tar.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import json
import logging
import pathlib
import tarfile

import fsspec
Expand Down Expand Up @@ -84,21 +86,39 @@ def __init__(
self.tar = tarfile.TarFile(fileobj=self.fo)
self.dir_cache = None

self.index_store = index_store
if isinstance(index_store, (str, pathlib.Path)):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's reasonable to only allow str here. That would allow the index to be stored anywhere (including in another storage backend), whereas Path is local-specific. If you use fsspec.open(), then you could pass anything that it can handle.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like that idea. Does fsspec have some way to check if a file exists, akin to pathlib.Path(...).exists()? I could use try/except and catch FileNotFoundError, unless there is already a way to check that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be a two-step process: create a filesystem instance (fsspec.url_to_fs) and call .exists on that - so it's probably fine to use try/except here. We have discussed elsewhere whether we should have top-level operations like exists(url) that dispatch to the implementations, as fsspec.open does, but nothing has been done yet (cf. fsspec.generic).

self.index_store = pathlib.Path(index_store)
elif bool(index_store) is True:
# TODO: How to handle a hashed filename from FileCache?
self.index_store = pathlib.Path(f"{name}.index.json")
else:
self.index_store = index_store
self.index = None
self._index()

def _index(self):
# TODO: load and set saved index, if exists
out = {}
for ti in self.tar:
info = ti.get_info()
info["type"] = typemap.get(info["type"], "file")
name = ti.get_info()["name"].rstrip("/")
out[name] = (info, ti.offset_data)

self.index = out
# TODO: save index to self.index_store here, if set
if self.index_store is not None and self.index_store.exists():
# NOTE(PG): Not sure if JSON is the best way to go here, but it's
# simple and human-readable.
logger.debug(f"Reloading from {self.index_store}")
with self.index_store.open("r") as f:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fsspec.open(self.index_store)

self.index = json.load(f)
else:
logger.debug(f"Populating {self.index_store}")
out = {}
for ti in self.tar:
info = ti.get_info()
info["type"] = typemap.get(info["type"], "file")
info["name"] = name = info["name"].rstrip("/")
out[name] = (info, ti.offset_data)

self.index = out
if self.index_store is not None:
with self.index_store.open("w") as f:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fsspec.open(self.index_store, "wt")

try:
json.dump(out, f)
except Exception as e:
logger.warning(f"Failed to write index: {e}")

def _get_dirs(self):
if self.dir_cache is not None:
Expand All @@ -107,13 +127,10 @@ def _get_dirs(self):
# This enables ls to get directories as children as well as files
self.dir_cache = {
dirname: {"name": dirname, "size": 0, "type": "directory"}
for dirname in self._all_dirnames(self.tar.getnames())
for dirname in self._all_dirnames(self.index.keys())
}
for member in self.tar.getmembers():
info = member.get_info()
info["name"] = info["name"].rstrip("/")
info["type"] = typemap.get(info["type"], "file")
self.dir_cache[info["name"]] = info
for name, (info, _) in self.index.items():
self.dir_cache[name] = info

def _open(self, path, mode="rb", **kwargs):
if mode != "rb":
Expand Down
Loading