Skip to content

Commit 8722684

Browse files
authored
Boost references speed (fsspec#892)
* Boost references speed by skipping _ls_from_cache
* Don't process templates if there are none
* Don't make dircache until needed (zarr access doesn't need it!)
1 parent 3cc5b48 commit 8722684

File tree

2 files changed

+12
-10
lines changed

2 files changed

+12
-10
lines changed

fsspec/implementations/github.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -156,15 +156,17 @@ def ls(self, path, detail=False, sha=None, _sha=None, **kwargs):
156156
if r.status_code == 404:
157157
raise FileNotFoundError(path)
158158
r.raise_for_status()
159+
types = {"blob": "file", "tree": "directory"}
159160
out = [
160161
{
161162
"name": path + "/" + f["path"] if path else f["path"],
162163
"mode": f["mode"],
163-
"type": {"blob": "file", "tree": "directory"}[f["type"]],
164+
"type": types[f["type"]],
164165
"size": f.get("size", 0),
165166
"sha": f["sha"],
166167
}
167168
for f in r.json()["tree"]
169+
if f["type"] in types
168170
]
169171
if sha in [self.root, None]:
170172
self.dircache[path] = out

fsspec/implementations/reference.py

+9 -9
Original file line number | Diff line number | Diff line change
@@ -221,6 +221,7 @@ def cat_file(self, path, start=None, end=None, **kwargs):
221221
part_or_url, start0, end0 = self._cat_common(path)
222222
if isinstance(part_or_url, bytes):
223223
return part_or_url[start:end]
224+
# TODO: update start0, end0 if start/end given, instead of slicing
224225
return self.fs.cat_file(part_or_url, start=start0, end=end0)[start:end]
225226

226227
def pipe_file(self, path, value, **_):
@@ -279,7 +280,6 @@ def _render_jinja(url):
279280

280281
if self.templates:
281282
self.df["url"] = self.df["url"].map(_render_jinja)
282-
self._dircache_from_items()
283283

284284
def _process_references(self, references, template_overrides=None):
285285
if isinstance(references, (str, bytes)):
@@ -293,7 +293,6 @@ def _process_references(self, references, template_overrides=None):
293293
raise ValueError(f"Unknown reference spec version: {vers}")
294294
# TODO: we make dircache by iterating over all entries, but for Spec >= 1,
295295
# can replace with programmatic. Is it even needed for mapper interface?
296-
self._dircache_from_items()
297296

298297
def _process_references0(self, references):
299298
"""Make reference dict for Spec Version 0"""
@@ -320,7 +319,7 @@ def _render_jinja(u):
320319
if v.startswith("base64:"):
321320
self.references[k] = base64.b64decode(v[7:])
322321
self.references[k] = v
323-
else:
322+
elif self.templates:
324323
u = v[0]
325324
if "{{" in u:
326325
if self.simple_templates:
@@ -332,6 +331,8 @@ def _render_jinja(u):
332331
else:
333332
u = _render_jinja(u)
334333
self.references[k] = [u] if len(v) == 1 else [u, v[1], v[2]]
334+
else:
335+
self.references[k] = v
335336
self.references.update(self._process_gen(references.get("gen", [])))
336337

337338
def _process_templates(self, tmp):
@@ -422,6 +423,8 @@ def open(self, path, mode="rb", block_size=None, cache_options=None, **kwargs):
422423

423424
def ls(self, path, detail=True, **kwargs):
424425
path = self._strip_protocol(path)
426+
if not self.dircache:
427+
self._dircache_from_items()
425428
out = self._ls_from_cache(path)
426429
if out is None:
427430
raise FileNotFoundError
@@ -430,16 +433,13 @@ def ls(self, path, detail=True, **kwargs):
430433
return [o["name"] for o in out]
431434

432435
def exists(self, path, **kwargs): # overwrite auto-sync version
433-
try:
434-
return self._ls_from_cache(path) is not None
435-
except FileNotFoundError:
436-
return False
436+
return self.isdir(path) or self.isfile(path)
437437

438438
def isdir(self, path): # overwrite auto-sync version
439-
return self.exists(path) and self.info(path)["type"] == "directory"
439+
return path in self.dircache
440440

441441
def isfile(self, path): # overwrite auto-sync version
442-
return self.exists(path) and self.info(path)["type"] == "file"
442+
return path in self.references
443443

444444
async def _ls(self, path, detail=True, **kwargs): # calls fast sync code
445445
return self.ls(path, detail, **kwargs)

0 commit comments

Comments
 (0)