Skip to content

Commit 2fbe8de

Browse files
authored
Support cache mapper that is basename plus fixed number of parent directories (#1318)
1 parent a988ce5 commit 2fbe8de

File tree

3 files changed

+141
-19
lines changed

3 files changed

+141
-19
lines changed

fsspec/implementations/cache_mapper.py

Lines changed: 28 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -2,9 +2,10 @@
22

33
import abc
44
import hashlib
5-
import os
65
from typing import TYPE_CHECKING
76

7+
from fsspec.implementations.local import make_path_posix
8+
89
if TYPE_CHECKING:
910
from typing import Any
1011

@@ -30,14 +31,36 @@ def __hash__(self) -> int:
3031

3132

3233
class BasenameCacheMapper(AbstractCacheMapper):
33-
"""Cache mapper that uses the basename of the remote URL.
34+
"""Cache mapper that uses the basename of the remote URL and a fixed number
35+
of directory levels above this.
3436
35-
Different paths with the same basename will therefore have the same cached
36-
basename.
37+
The default is zero directory levels, meaning different paths with the same
38+
basename will have the same cached basename.
3739
"""
3840

41+
def __init__(self, directory_levels: int = 0):
42+
if directory_levels < 0:
43+
raise ValueError(
44+
"BasenameCacheMapper requires zero or positive directory_levels"
45+
)
46+
self.directory_levels = directory_levels
47+
48+
# Separator for directories when encoded as strings.
49+
self._separator = "_@_"
50+
3951
def __call__(self, path: str) -> str:
40-
return os.path.basename(path)
52+
path = make_path_posix(path)
53+
prefix, *bits = path.rsplit("/", self.directory_levels + 1)
54+
if bits:
55+
return self._separator.join(bits)
56+
else:
57+
return prefix # No separator found, simple filename
58+
59+
def __eq__(self, other: Any) -> bool:
60+
return super().__eq__(other) and self.directory_levels == other.directory_levels
61+
62+
def __hash__(self) -> int:
63+
return super().__hash__() ^ hash(self.directory_levels)
4164

4265

4366
class HashCacheMapper(AbstractCacheMapper):

fsspec/implementations/cached.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import tempfile
99
import time
1010
from shutil import rmtree
11-
from typing import Any, ClassVar
11+
from typing import TYPE_CHECKING, Any, Callable, ClassVar
1212

1313
from fsspec import AbstractFileSystem, filesystem
1414
from fsspec.callbacks import _DEFAULT_CALLBACK
@@ -19,6 +19,9 @@
1919
from fsspec.spec import AbstractBufferedFile
2020
from fsspec.utils import infer_compression
2121

22+
if TYPE_CHECKING:
23+
from fsspec.implementations.cache_mapper import AbstractCacheMapper
24+
2225
logger = logging.getLogger("fsspec.cached")
2326

2427

@@ -53,8 +56,9 @@ def __init__(
5356
expiry_time=604800,
5457
target_options=None,
5558
fs=None,
56-
same_names=False,
59+
same_names: bool | None = None,
5760
compression=None,
61+
cache_mapper: AbstractCacheMapper | None = None,
5862
**kwargs,
5963
):
6064
"""
@@ -84,13 +88,19 @@ def __init__(
8488
fs: filesystem instance
8589
The target filesystem to run against. Provide this or ``protocol``.
8690
same_names: bool (optional)
87-
By default, target URLs are hashed, so that files from different backends
88-
with the same basename do not conflict. If this is true, the original
89-
basename is used.
91+
By default, target URLs are hashed using a ``HashCacheMapper`` so
92+
that files from different backends with the same basename do not
93+
conflict. If this argument is ``True``, a ``BasenameCacheMapper``
94+
is used instead. Other cache mapper options are available by using
95+
the ``cache_mapper`` keyword argument. Only one of this and
96+
``cache_mapper`` should be specified.
9097
compression: str (optional)
9198
To decompress on download. Can be 'infer' (guess from the URL name),
9299
one of the entries in ``fsspec.compression.compr``, or None for no
93100
decompression.
101+
cache_mapper: AbstractCacheMapper (optional)
102+
The object used to map from original filenames to cached filenames.
103+
Only one of this and ``same_names`` should be specified.
94104
"""
95105
super().__init__(**kwargs)
96106
if fs is None and target_protocol is None:
@@ -115,7 +125,19 @@ def __init__(
115125
self.check_files = check_files
116126
self.expiry = expiry_time
117127
self.compression = compression
118-
self._mapper = create_cache_mapper(same_names)
128+
129+
if same_names is not None and cache_mapper is not None:
130+
raise ValueError(
131+
"Cannot specify both same_names and cache_mapper in "
132+
"CachingFileSystem.__init__"
133+
)
134+
if cache_mapper is not None:
135+
self._mapper = cache_mapper
136+
else:
137+
self._mapper = create_cache_mapper(
138+
same_names if same_names is not None else False
139+
)
140+
119141
self.target_protocol = (
120142
target_protocol
121143
if isinstance(target_protocol, str)
@@ -128,7 +150,7 @@ def _strip_protocol(path):
128150
# acts as a method, since each instance has a different target
129151
return self.fs._strip_protocol(type(self)._strip_protocol(path))
130152

131-
self._strip_protocol = _strip_protocol
153+
self._strip_protocol: Callable = _strip_protocol
132154

133155
def _mkcache(self):
134156
os.makedirs(self.storage[-1], exist_ok=True)

fsspec/implementations/tests/test_cached.py

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,11 @@
88
import fsspec
99
from fsspec.compression import compr
1010
from fsspec.exceptions import BlocksizeMismatchError
11-
from fsspec.implementations.cache_mapper import create_cache_mapper
11+
from fsspec.implementations.cache_mapper import (
12+
BasenameCacheMapper,
13+
HashCacheMapper,
14+
create_cache_mapper,
15+
)
1216
from fsspec.implementations.cached import CachingFileSystem, LocalTempFile
1317
from fsspec.implementations.local import make_path_posix
1418

@@ -36,10 +40,20 @@ def local_filecache():
3640

3741
def test_mapper():
3842
mapper0 = create_cache_mapper(True)
43+
assert mapper0("somefile") == "somefile"
44+
assert mapper0("/somefile") == "somefile"
3945
assert mapper0("/somedir/somefile") == "somefile"
4046
assert mapper0("/otherdir/somefile") == "somefile"
4147

4248
mapper1 = create_cache_mapper(False)
49+
assert (
50+
mapper1("somefile")
51+
== "dd00b9487898b02555b6a2d90a070586d63f93e80c70aaa60c992fa9e81a72fe"
52+
)
53+
assert (
54+
mapper1("/somefile")
55+
== "884c07bc2efe65c60fb9d280a620e7f180488718fb5d97736521b7f9cf5c8b37"
56+
)
4357
assert (
4458
mapper1("/somedir/somefile")
4559
== "67a6956e5a5f95231263f03758c1fd9254fdb1c564d311674cec56b0372d2056"
@@ -57,9 +71,47 @@ def test_mapper():
5771
assert hash(create_cache_mapper(True)) == hash(mapper0)
5872
assert hash(create_cache_mapper(False)) == hash(mapper1)
5973

60-
61-
@pytest.mark.parametrize("same_names", [False, True])
62-
def test_metadata(tmpdir, same_names):
74+
with pytest.raises(
75+
ValueError,
76+
match="BasenameCacheMapper requires zero or positive directory_levels",
77+
):
78+
BasenameCacheMapper(-1)
79+
80+
mapper2 = BasenameCacheMapper(1)
81+
assert mapper2("/somefile") == "somefile"
82+
assert mapper2("/somedir/somefile") == "somedir_@_somefile"
83+
assert mapper2("/otherdir/somefile") == "otherdir_@_somefile"
84+
assert mapper2("/dir1/dir2/dir3/somefile") == "dir3_@_somefile"
85+
86+
assert mapper2 != mapper0
87+
assert mapper2 != mapper1
88+
assert BasenameCacheMapper(1) == mapper2
89+
90+
assert hash(mapper2) != hash(mapper0)
91+
assert hash(mapper2) != hash(mapper1)
92+
assert hash(BasenameCacheMapper(1)) == hash(mapper2)
93+
94+
mapper3 = BasenameCacheMapper(2)
95+
assert mapper3("/somefile") == "somefile"
96+
assert mapper3("/somedir/somefile") == "somedir_@_somefile"
97+
assert mapper3("/otherdir/somefile") == "otherdir_@_somefile"
98+
assert mapper3("/dir1/dir2/dir3/somefile") == "dir2_@_dir3_@_somefile"
99+
100+
assert mapper3 != mapper0
101+
assert mapper3 != mapper1
102+
assert mapper3 != mapper2
103+
assert BasenameCacheMapper(2) == mapper3
104+
105+
assert hash(mapper3) != hash(mapper0)
106+
assert hash(mapper3) != hash(mapper1)
107+
assert hash(mapper3) != hash(mapper2)
108+
assert hash(BasenameCacheMapper(2)) == hash(mapper3)
109+
110+
111+
@pytest.mark.parametrize(
112+
"cache_mapper", [BasenameCacheMapper(), BasenameCacheMapper(1), HashCacheMapper()]
113+
)
114+
def test_metadata(tmpdir, cache_mapper):
63115
source = os.path.join(tmpdir, "source")
64116
afile = os.path.join(source, "afile")
65117
os.mkdir(source)
@@ -69,7 +121,7 @@ def test_metadata(tmpdir, same_names):
69121
"filecache",
70122
target_protocol="file",
71123
cache_storage=os.path.join(tmpdir, "cache"),
72-
same_names=same_names,
124+
cache_mapper=cache_mapper,
73125
)
74126

75127
with fs.open(afile, "rb") as f:
@@ -85,8 +137,33 @@ def test_metadata(tmpdir, same_names):
85137

86138
assert detail["original"] == afile_posix
87139
assert detail["fn"] == fs._mapper(afile_posix)
88-
if same_names:
89-
assert detail["fn"] == "afile"
140+
141+
if isinstance(cache_mapper, BasenameCacheMapper):
142+
if cache_mapper.directory_levels == 0:
143+
assert detail["fn"] == "afile"
144+
else:
145+
assert detail["fn"] == "source_@_afile"
146+
147+
148+
def test_constructor_kwargs(tmpdir):
149+
fs = fsspec.filesystem("filecache", target_protocol="file", same_names=True)
150+
assert isinstance(fs._mapper, BasenameCacheMapper)
151+
152+
fs = fsspec.filesystem("filecache", target_protocol="file", same_names=False)
153+
assert isinstance(fs._mapper, HashCacheMapper)
154+
155+
fs = fsspec.filesystem("filecache", target_protocol="file")
156+
assert isinstance(fs._mapper, HashCacheMapper)
157+
158+
with pytest.raises(
159+
ValueError, match="Cannot specify both same_names and cache_mapper"
160+
):
161+
fs = fsspec.filesystem(
162+
"filecache",
163+
target_protocol="file",
164+
cache_mapper=HashCacheMapper(),
165+
same_names=True,
166+
)
90167

91168

92169
def test_idempotent():

0 commit comments

Comments
 (0)