Skip to content

Commit e481138

Browse files
committed
feat: gitpathspec module for handling Git's pathspecs
Besides the basic type provided here, the key feature is the translation of pathspecs into the scope of a subdirectory. This is the foundational support for implementations that focus on submodule-recursion combined with pathspecs. Git does not generally provide this support in its commands.
1 parent 40543a4 commit e481138

File tree

6 files changed

+925
-1
lines changed

6 files changed

+925
-1
lines changed

datasalad/gitpathspec/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Handling of Git's pathspecs with subdirectory mangling support
2+
3+
This functionality can be used to add support for pathspecs to implementations
4+
that rely on Git commands that do not support submodule recursion directly.
5+
6+
.. currentmodule:: datasalad.gitpathspec
7+
.. autosummary::
8+
:toctree: generated
9+
10+
GitPathSpec
11+
GitPathSpecs
12+
"""
13+
14+
__all__ = ['GitPathSpec', 'GitPathSpecs']
15+
16+
from .pathspec import GitPathSpec
17+
from .pathspecs import GitPathSpecs

datasalad/gitpathspec/pathspec.py

+320
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
#
2+
# Intentionally written without importing datalad code
3+
#
4+
from __future__ import annotations
5+
6+
import posixpath
7+
from dataclasses import dataclass
8+
from fnmatch import fnmatch
9+
from typing import Generator
10+
11+
12+
@dataclass(frozen=True)
class GitPathSpec:
    """Support class for patterns used to limit paths in Git commands

    From the Git documentation:

      Pathspecs are used on the command line of "git ls-files", "git ls-tree",
      "git add", "git grep", "git diff", "git checkout", and many other
      commands to limit the scope of operations to some subset of the tree
      or working tree.

    Apart from providing a dedicated type for a pathspec, the main purpose
    of this functionality is to take a pathspec that is valid in the context
    of one (top-level) repository, and translate it such that the set of
    pathspecs given to the same command running on/in a submodule/subdirectory
    gives the same results, as if the initial top-level invocation reported
    them (if it even could). See the ``for_subdir()`` method for more.

    >>> # simple stripping of leading directory
    >>> ps = GitPathSpec.from_pathspec_str('dir/*.jpg')
    >>> [str(i) for i in ps.for_subdir('dir')]
    ['*.jpg']
    >>> # match against magic pathspecs
    >>> ps = GitPathSpec.from_pathspec_str(':(glob)**r/*.jpg')
    >>> # longest and shortest match are produced
    >>> [str(i) for i in ps.for_subdir('dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> [str(i) for i in ps.for_subdir('root/some/dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> # support for special 'no-pathspec' pathspec
    >>> ps = GitPathSpec.from_pathspec_str(':')
    >>> ps.is_nopathspecs
    True

    .. seealso::

      - Entry in the Git glossary:
        https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec
      - Informative, more elaborate description of pathspecs:
        https://css-tricks.com/git-pathspecs-and-how-to-use-them/
    """

    # TODO: think about adding support for another magic that represents
    # the root of a repository hierarchy (amending 'top', which is
    # the root of the working tree -- but presumably for a single repository)
    spectypes: tuple[str, ...]
    """Long-form pathspec type identifiers"""
    dirprefix: str | None
    """Directory prefix (pathspec up to the last slash) limiting the scope"""
    pattern: str | None
    """Pattern to match paths against using ``fnmatch``"""

    @property
    def is_nopathspecs(self) -> bool:
        """Whether this pathspec is the "no pathspecs" pathspec, AKA ``':'``"""
        return not self.spectypes and not self.dirprefix and not self.pattern

    def __str__(self) -> str:
        """Generate normalized (long-form) pathspec"""
        if self.is_nopathspecs:
            return ':'
        magic = f':({",".join(self.spectypes)})' if self.spectypes else ''
        return f'{magic}{self.get_joined_pattern()}'

    def get_joined_pattern(self) -> str:
        """Return directory prefix and pattern joined by a slash

        A missing (``None`` or empty) directory prefix contributes nothing,
        not even the separator. A missing pattern contributes an empty
        string, hence a prefix-only pathspec yields ``'<dirprefix>/'``.
        """
        prefix = f'{self.dirprefix}/' if self.dirprefix else ''
        return f'{prefix}{self.pattern if self.pattern else ""}'

    def for_subdir(self, subdir: str) -> list[GitPathSpec]:
        """Translate a pathspec into the scope of a subdirectory.

        The processing implemented here is purely lexical. This means that it
        works without matching against actual file system (or Git tree)
        content. Consequently, to some degree, overly broad results are
        produced, but at the same time use cases are supported where there
        is nothing (yet) to match against (e.g., a not-yet-cloned submodule).

        A pathspec with a ``top`` magic is produced unmodified, as they are
        defined relative to the root of a repository, not relative to a base
        directory. As a consequence, such pathspecs will automatically
        refer to a submodule root when the target directory is contained in
        one.

        Parameters
        ----------
        subdir: str
          Relative path in POSIX notation

        Returns
        -------
        list
          When an empty list is returned, this indicates that the pathspec
          cannot be translated to the given ``subdir``, because it does
          not match the ``subdir`` itself. If a pathspec translates to
          "no pathspecs" (``':'``), a list with a dedicated ':' pathspec is
          returned.
        """
        # special case of a non-translation (pretty much only here to
        # make some test implementations simpler)
        if not subdir:
            return [self]

        return list(yield_subdir_match_remainder_pathspecs(subdir, self))

    @staticmethod
    def _parse_magic(pathspec: str) -> tuple[list[str], str]:
        """Split a string-form pathspec into magic identifiers and pattern

        Parameters
        ----------
        pathspec: str
          Pathspec in plain, short-form magic (``:/!^``), or long-form
          magic (``:(name,...)``) notation.

        Returns
        -------
        tuple
          A list of long-form magic identifiers (in order of appearance,
          possibly empty), and the remaining pattern string.

        Raises
        ------
        ValueError
          When a long-form magic declaration lacks its closing ``)``.
        """
        if pathspec.startswith(':('):
            # long-form magic
            magic, closing, pattern = pathspec[2:].partition(')')
            if not closing:
                msg = f"missing ')' at the end of pathspec magic in {pathspec!r}"
                raise ValueError(msg)
            return magic.split(','), pattern

        if not pathspec.startswith(':'):
            # no magic at all, everything is the pattern
            return [], pathspec

        # short-form magic
        short_magic = {
            '/': 'top',
            '!': 'exclude',
            '^': 'exclude',
        }
        spectypes = []
        pos = 1
        while pos < len(pathspec) and pathspec[pos] in short_magic:
            spectypes.append(short_magic[pathspec[pos]])
            pos += 1
        # the magic signature is optionally terminated by another colon,
        # which belongs to the signature and not to the pattern
        if pathspec.startswith(':', pos):
            pos += 1
        return spectypes, pathspec[pos:]

    @classmethod
    def from_pathspec_str(
        cls,
        pathspec: str,
    ) -> GitPathSpec:
        """Parse a string-form pathspec into types, prefix, and pattern

        Raises
        ------
        ValueError
          When ``glob`` and ``literal`` magic are declared simultaneously,
          or when a long-form magic declaration is not terminated.
        """
        if pathspec == ':':
            # shortcut for the special no-path-spec pathspec
            return cls((), '', None)

        spectypes, pattern = cls._parse_magic(pathspec)

        # raise when glob and literal magic markers are present
        # simultaneously
        if 'glob' in spectypes and 'literal' in spectypes:
            msg = "'glob' magic is incompatible with 'literal' magic"
            raise ValueError(msg)

        # split off dirprefix
        dirprefix, pattern = _split_prefix_pattern(pattern)

        return cls(
            spectypes=tuple(spectypes),
            dirprefix=dirprefix,
            pattern=pattern,
        )
175+
176+
177+
def _split_prefix_pattern(pathspec):
178+
# > the pathspec up to the last slash represents a directory prefix.
179+
# > The scope of that pathspec is limited to that subtree.
180+
try:
181+
last_slash_idx = pathspec[::-1].index('/')
182+
except ValueError:
183+
# everything is the pattern
184+
dirprefix = None
185+
pattern = pathspec
186+
else:
187+
dirprefix = pathspec[: -last_slash_idx - 1]
188+
pattern = pathspec[-last_slash_idx:] if last_slash_idx > 0 else None
189+
return dirprefix, pattern
190+
191+
192+
def yield_subdir_match_remainder_pathspecs(
    subdir: str,
    pathspec: GitPathSpec,
) -> Generator[GitPathSpec, None, None]:
    """Translate a pathspec into a set of possible subdirectory pathspecs

    The processing implemented here is purely lexical. This means that it
    works without matching against actual file system (or Git tree) content.
    This means that it yields, to some degree, overly broad results, but also
    that it works in cases where there is nothing (yet) to match against.
    For example, a not-yet-cloned submodule.

    This function does not perform any validity checking of pathspecs. Only
    valid pathspecs and well-formed paths are supported.

    A pathspec with the ``top`` magic is returned immediately and as-is. These
    pathspecs have an absolute reference and do not require a translation into
    a subdirectory namespace.

    Parameters
    ----------
    subdir: str
      POSIX-notation relative path of a subdirectory. The reference directory
      must be the same as that of the pathspec to be translated.
    pathspec: GitPathSpec
      To-be-translated pathspec

    Yields
    ------
    GitPathSpec
      Any number of pathspecs that an input pathspec decomposed into upon
      translation into the namespace of a subdirectory. Nothing is yielded
      when the pathspec does not match the subdirectory.
    """
    if 'top' in pathspec.spectypes or pathspec.is_nopathspecs:
        # pathspec with an absolute reference, or "no pathspecs"
        # no translation needed
        yield pathspec
        return

    # add a trailing directory separator to prevent undesired
    # matches of partial directory names
    subdir = subdir if subdir.endswith('/') else f'{subdir}/'
    tp = pathspec.get_joined_pattern()

    if 'icase' in pathspec.spectypes:
        subdir = subdir.casefold()
        tp = tp.casefold()

    # literal pathspecs
    if 'literal' in pathspec.spectypes:
        # ensure a trailing slash, such that only complete path components
        # can produce a match
        tp_dir = tp if tp.endswith('/') else f'{tp}/'
        if not tp_dir.startswith(subdir):
            # the pathspec does not point to, or into, the subdir
            # BUT
            # it may name a parent directory of a multi-level subdir.
            # In this case everything underneath is matched and we yield
            # a 'no pathspec' result. A mere shared leading path fragment
            # (e.g. 'a/x' vs. subdir 'a/b') is not a match.
            if subdir.startswith(tp_dir):
                yield GitPathSpec.from_pathspec_str(':')
            return

        remainder = tp[len(subdir) :]
        if not remainder:
            # full match
            yield GitPathSpec.from_pathspec_str(':')
        else:
            yield GitPathSpec(pathspec.spectypes, *_split_prefix_pattern(remainder))
        return

    # tokenize the testpattern using the wildcard that also matches
    # directories
    token_delim = '**' if 'glob' in pathspec.spectypes else '*'
    tp_chunks = tp.split(token_delim)
    prefix_match = ''
    # pathspecs produced so far, used to avoid reporting a duplicate
    yielded = set()
    for i, chunk in enumerate(tp_chunks):
        last_chunk = i + 1 == len(tp_chunks)
        if last_chunk:
            trymatch = f'{prefix_match}{chunk}{"" if chunk.endswith("/") else "/"}'
        else:
            trymatch = f'{prefix_match}{chunk}*'
        if not fnmatch(subdir, trymatch):
            # each chunk needs match in order, first non-match ends the
            # algorithm
            # BUT
            # we have an (initial) chunk that points already
            # inside the target subdir
            submatch = trymatch
            while submatch := posixpath.split(submatch)[0]:
                if fnmatch(subdir, f'{submatch}/'):
                    ps = GitPathSpec(
                        pathspec.spectypes,
                        *_split_prefix_pattern(
                            # +1 for trailing slash
                            tp[len(submatch) + 1 :]
                        ),
                    )
                    if ps not in yielded:
                        yield ps
                    return
            # OR
            # we might have a multi-level subdir, and we might match an
            # intermediate subdir and could still yield a 'no pathspec'
            # result
            while subdir := posixpath.split(subdir)[0]:
                if fnmatch(f'{subdir}/', trymatch):
                    yield GitPathSpec.from_pathspec_str(':')
                    return
            return

        remainder = tp_chunks[i + 1 :]
        if all(not c for c in remainder):
            # direct hit, no pathspecs after translation
            yield GitPathSpec.from_pathspec_str(':')
            return

        ps = GitPathSpec(
            pathspec.spectypes,
            *_split_prefix_pattern(
                f'{token_delim}{token_delim.join(remainder)}',
            ),
        )
        yield ps
        yielded.add(ps)
        # extend prefix for the next round
        prefix_match = trymatch

0 commit comments

Comments
 (0)