Skip to content

Commit b1cc072

Browse files
committed
feat: gitpathspec module for handling Git's pathspecs
Besides the basic type provided here, the key feature is the translation of pathspecs into the scope of a subdirectory. This is the foundational support for implementations that focus on submodule-recursion combined with pathspecs. Git does not generally provide this support in its commands.
1 parent 3862c67 commit b1cc072

File tree

6 files changed

+934
-1
lines changed

6 files changed

+934
-1
lines changed

datasalad/gitpathspec/__init__.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
"""Handling of Git's pathspecs with subdirectory mangling support
2+
3+
This functionality can be used to add support for pathspecs to implementations
4+
that rely on Git commands that do not support submodule recursion directly.
5+
6+
.. currentmodule:: datasalad.gitpathspec
7+
.. autosummary::
8+
:toctree: generated
9+
10+
GitPathSpec
11+
GitPathSpecs
12+
"""
13+
14+
__all__ = ['GitPathSpec', 'GitPathSpecs']
15+
16+
from .pathspec import GitPathSpec
17+
from .pathspecs import GitPathSpecs

datasalad/gitpathspec/pathspec.py

+327
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
#
2+
# Intentionally written without importing datalad code
3+
#
4+
from __future__ import annotations
5+
6+
import posixpath
7+
from dataclasses import dataclass
8+
from fnmatch import fnmatch
9+
from typing import Generator
10+
11+
12+
@dataclass(frozen=True)
class GitPathSpec:
    """Support class for patterns used to limit paths in Git commands

    From the Git documentation:

      Pathspecs are used on the command line of "git ls-files", "git ls-tree",
      "git add", "git grep", "git diff", "git checkout", and many other
      commands to limit the scope of operations to some subset of the tree
      or working tree.

    Apart from providing a dedicated type for a pathspec, the main purpose
    of this functionality is to take a pathspec that is valid in the context
    of one (top-level) repository, and translate it such that the set of
    pathspecs given to the same command running on/in a submodule/subdirectory
    gives the same results, as if the initial top-level invocation reported
    them (if it even could). See the ``for_subdir()`` method for more.

    >>> # simple stripping of leading directory
    >>> ps = GitPathSpec.from_pathspec_str('dir/*.jpg')
    >>> [str(i) for i in ps.for_subdir('dir')]
    ['*.jpg']
    >>> # match against magic pathspecs
    >>> ps = GitPathSpec.from_pathspec_str(':(glob)**r/*.jpg')
    >>> # longest and shortest match are produced
    >>> [str(i) for i in ps.for_subdir('dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> [str(i) for i in ps.for_subdir('root/some/dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> # support for special 'no-pathspec' pathspec
    >>> ps = GitPathSpec.from_pathspec_str(':')
    >>> ps.is_nopathspecs
    True

    .. seealso::

      - Entry in the Git glossary:
        https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec
      - Informative, more elaborate description of pathspecs:
        https://css-tricks.com/git-pathspecs-and-how-to-use-them/
    """

    # TODO: think about adding support for another magic that represents
    # the root of a repository hierarchy (amending 'top', which is
    # the root of the working tree -- but presumably for a single repository)
    spectypes: tuple[str, ...]
    """Long-form pathspec type identifiers"""
    dirprefix: str | None
    """Directory prefix (pathspec up to the last slash) limiting the scope"""
    pattern: str | None
    """Pattern to match paths against using ``fnmatch``"""

    @property
    def is_nopathspecs(self) -> bool:
        """Whether this pathspec is the "no pathspecs" pathspec, AKA ``':'``"""
        return not self.spectypes and not self.dirprefix and not self.pattern

    def __str__(self) -> str:
        """Generate normalized (long-form) pathspec"""
        if self.is_nopathspecs:
            return ':'
        magic = f':({",".join(self.spectypes)})' if self.spectypes else ''
        return f'{magic}{self.get_joined_pattern()}'

    def get_joined_pattern(self) -> str:
        """Return directory prefix and pattern joined into a single string

        A non-empty ``dirprefix`` is followed by a slash. A missing or empty
        ``dirprefix`` or ``pattern`` contributes nothing to the result.
        """
        prefix = f'{self.dirprefix}/' if self.dirprefix else ''
        return f'{prefix}{self.pattern or ""}'

    def for_subdir(self, subdir: str) -> list[GitPathSpec]:
        """Translate a pathspec into the scope of a subdirectory.

        The processing implemented here is purely lexical. This means that it
        works without matching against actual file system (or Git tree)
        content. Consequently, to some degree, overly broad results are
        produced, but at the same time use cases are supported where there
        is nothing (yet) to match against (e.g., a not-yet-cloned submodule).

        A pathspec with a ``top`` magic is produced unmodified, as they are
        defined relative to the root of a repository, not relative to a base
        directory. As a consequence, such pathspecs will automatically
        refer to a submodule root when the target directory is contained in
        one.

        Parameters
        ----------
        subdir: str
          Relative path in POSIX notation

        Returns
        -------
        list
          When an empty list is returned, this indicates that the pathspec
          cannot be translated to the given ``subdir``, because it does
          not match the ``subdir`` itself. If a pathspec translates to
          "no pathspecs" (``':'``), a list with a dedicated ':' pathspec is
          returned.
        """
        # special case of a non-translation (pretty much only here to
        # make some test implementations simpler)
        if not subdir:
            return [self]

        return list(yield_subdir_match_remainder_pathspecs(subdir, self))

    @classmethod
    def from_pathspec_str(
        cls,
        pathspec: str,
    ) -> GitPathSpec:
        """Parse a string-form pathspec into types, prefix, and pattern

        Parameters
        ----------
        pathspec: str
          Pathspec in plain, short-form magic (``:/...``), or long-form
          magic (``:(glob,icase)...``) notation.

        Returns
        -------
        GitPathSpec
          Instance of the class this method was called on.

        Raises
        ------
        ValueError
          When a long-form magic signature is not terminated by ``)``, or
          when ``glob`` and ``literal`` magic are requested simultaneously.
        """
        if pathspec == ':':
            # shortcut for the special no-path-spec pathspec;
            # use `cls` so that subclasses get instances of their own type
            return cls((), '', None)

        spectypes: list[str] = []
        if pathspec.startswith(':('):
            # long-form magic
            magic, closing, pattern = pathspec[2:].partition(')')
            if not closing:
                msg = (
                    "long-form pathspec magic lacks closing ')': "
                    f'{pathspec!r}'
                )
                raise ValueError(msg)
            # zero-length magic names (as in ':()' or ':(,top)') carry no
            # information and are skipped
            spectypes = [m for m in magic.split(',') if m]
        elif pathspec.startswith(':'):
            # short-form magic
            spectypes, pattern = _pathspec_from_shortform(pathspec)
        else:
            pattern = pathspec

        # raise when glob and literal magic markers are present
        # simultaneously
        if 'glob' in spectypes and 'literal' in spectypes:
            msg = "'glob' magic is incompatible with 'literal' magic"
            raise ValueError(msg)

        # split off dirprefix
        dirprefix, pattern = _split_prefix_pattern(pattern)

        return cls(
            spectypes=tuple(spectypes),
            dirprefix=dirprefix,
            pattern=pattern,
        )
162+
163+
164+
def _pathspec_from_shortform(spec: str) -> tuple[list[str], str]:
165+
# short-form magic
166+
magic_signatures = {
167+
'/': 'top',
168+
'!': 'exclude',
169+
'^': 'exclude',
170+
':': None,
171+
}
172+
pattern = spec[1:]
173+
spectypes: list[str] = []
174+
for i in range(1, len(spec)):
175+
sig = magic_signatures.get(spec[i])
176+
if sig is None:
177+
return (spectypes, spec[i:])
178+
spectypes.append(sig)
179+
return (spectypes, pattern)
180+
181+
182+
def _split_prefix_pattern(pathspec: str) -> tuple[str | None, str | None]:
183+
# > the pathspec up to the last slash represents a directory prefix.
184+
# > The scope of that pathspec is limited to that subtree.
185+
pattern: str | None = None
186+
try:
187+
last_slash_idx = pathspec[::-1].index('/')
188+
except ValueError:
189+
# everything is the pattern
190+
dirprefix = None
191+
pattern = pathspec
192+
else:
193+
dirprefix = pathspec[: -last_slash_idx - 1]
194+
pattern = pathspec[-last_slash_idx:] if last_slash_idx > 0 else None
195+
return dirprefix, pattern
196+
197+
198+
def _iter_parent_dirs(path: str) -> Generator[str, None, None]:
    """Yield successively shorter leading directories of a POSIX path

    Stops when no directory component is left, or when ``posixpath.split()``
    no longer shortens the path (e.g., for ``'/'``), so that iteration
    always terminates.
    """
    parent = posixpath.split(path)[0]
    while parent and parent != path:
        yield parent
        path, parent = parent, posixpath.split(parent)[0]


def yield_subdir_match_remainder_pathspecs(
    subdir: str,
    pathspec: GitPathSpec,
) -> Generator[GitPathSpec, None, None]:
    """Translate a pathspec into a set of possible subdirectory pathspecs

    The processing implemented here is purely lexical. This means that it
    works without matching against actual file system (or Git tree) content.
    This means that it yields, to some degree, overly broad results, but also
    that it works in cases where there is nothing (yet) to match against.
    For example, a not-yet-cloned submodule.

    This function does not perform any validity checking of pathspecs. Only
    valid pathspecs and well-formed paths are supported.

    A pathspec with the ``top`` magic is returned immediately and as-is. These
    pathspecs have an absolute reference and do not require a translation into
    a subdirectory namespace.

    Parameters
    ----------
    subdir: str
      POSIX-notation relative path of a subdirectory. The reference directory
      must be the same as that of the pathspec to be translated.
    pathspec: GitPathSpec
      To-be-translated pathspec

    Yields
    ------
    GitPathSpec
      Any number of pathspecs that an input pathspec decomposed into upon
      translation into the namespace of a subdirectory.
    """
    if 'top' in pathspec.spectypes or pathspec.is_nopathspecs:
        # pathspec with an absolute reference, or "no pathspecs"
        # no translation needed
        yield pathspec
        return

    # add a trailing directory separator to prevent undesired
    # matches of partial directory names
    subdir = subdir if subdir.endswith('/') else f'{subdir}/'
    tp = pathspec.get_joined_pattern()

    if 'icase' in pathspec.spectypes:
        subdir = subdir.casefold()
        tp = tp.casefold()

    # literal pathspecs
    if 'literal' in pathspec.spectypes:
        # ensure a single trailing slash to allow for full matches
        tp_endslash = tp if tp.endswith('/') else f'{tp}/'
        if tp_endslash.startswith(subdir):
            # the pathspec points to the subdir itself, or into it
            remainder = tp[len(subdir) :]
            if not remainder:
                # full match
                yield GitPathSpec.from_pathspec_str(':')
            else:
                yield GitPathSpec(
                    pathspec.spectypes, *_split_prefix_pattern(remainder)
                )
        elif subdir.startswith(tp_endslash):
            # the pathspec names a parent directory of a multi-level
            # subdir, hence it covers all content of the subdir.
            # Both sides carry a trailing slash, so only complete path
            # components can match: neither a partial directory name nor
            # a sibling path sharing a leading directory qualifies
            yield GitPathSpec.from_pathspec_str(':')
        # anything else is no match
        return

    # tokenize the testpattern using the wildcard that also matches
    # directories
    token_delim = '**' if 'glob' in pathspec.spectypes else '*'
    tp_chunks = tp.split(token_delim)
    prefix_match = ''
    yielded = set()
    for i, chunk in enumerate(tp_chunks):
        last_chunk = i + 1 == len(tp_chunks)
        if last_chunk:
            trymatch = (
                f'{prefix_match}{chunk}{"" if chunk.endswith("/") else "/"}'
            )
        else:
            trymatch = f'{prefix_match}{chunk}*'
        if not fnmatch(subdir, trymatch):
            # each chunk needs match in order, first non-match ends the
            # algorithm
            # BUT
            # we have an (initial) chunk that points already
            # inside the target subdir
            for submatch in _iter_parent_dirs(trymatch):
                if not fnmatch(subdir, f'{submatch}/'):
                    continue
                ps = GitPathSpec(
                    pathspec.spectypes,
                    *_split_prefix_pattern(
                        # +1 for trailing slash
                        tp[len(submatch) + 1 :]
                    ),
                )
                if ps not in yielded:
                    yield ps
                return
            # OR
            # we might have a multi-level subdir, and we might match an
            # intermediate subdir and could still yield a 'no pathspec'
            # result
            for parent in _iter_parent_dirs(subdir):
                if fnmatch(f'{parent}/', trymatch):
                    yield GitPathSpec.from_pathspec_str(':')
                    return
            return

        remainder = tp_chunks[i + 1 :]
        if all(not c for c in remainder):
            # direct hit, no pathspecs after translation
            yield GitPathSpec.from_pathspec_str(':')
            return

        ps = GitPathSpec(
            pathspec.spectypes,
            *_split_prefix_pattern(
                f'{token_delim}{token_delim.join(remainder)}',
            ),
        )
        yield ps
        yielded.add(ps)
        # extend prefix for the next round
        prefix_match = trymatch

0 commit comments

Comments
 (0)