|
| 1 | +# |
| 2 | +# Intentionally written without importing datalad code |
| 3 | +# |
| 4 | +from __future__ import annotations |
| 5 | + |
| 6 | +import posixpath |
| 7 | +from dataclasses import dataclass |
| 8 | +from fnmatch import fnmatch |
| 9 | +from typing import Generator |
| 10 | + |
| 11 | + |
@dataclass(frozen=True)
class GitPathSpec:
    """Support class for patterns used to limit paths in Git commands

    From the Git documentation:

      Pathspecs are used on the command line of "git ls-files", "git ls-tree",
      "git add", "git grep", "git diff", "git checkout", and many other
      commands to limit the scope of operations to some subset of the tree
      or working tree.

    Apart from providing a dedicated type for a pathspec, the main purpose
    of this functionality is to take a pathspec that is valid in the context
    of one (top-level) repository, and translate it such that the set of
    pathspecs given to the same command running on/in a submodule/subdirectory
    gives the same results, as if the initial top-level invocation reported
    them (if it even could). See the ``for_subdir()`` method for more.

    >>> # simple stripping of leading directory
    >>> ps = GitPathSpec.from_pathspec_str('dir/*.jpg')
    >>> [str(i) for i in ps.for_subdir('dir')]
    ['*.jpg']
    >>> # match against magic pathspecs
    >>> ps = GitPathSpec.from_pathspec_str(':(glob)**r/*.jpg')
    >>> # longest and shortest match are produced
    >>> [str(i) for i in ps.for_subdir('dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> [str(i) for i in ps.for_subdir('root/some/dir')]
    [':(glob)**r/*.jpg', ':(glob)*.jpg']
    >>> # support for special 'no-pathspec' pathspec
    >>> ps = GitPathSpec.from_pathspec_str(':')
    >>> ps.is_nopathspecs
    True

    .. seealso::

      - Entry in the Git glossary:
        https://git-scm.com/docs/gitglossary#Documentation/gitglossary.txt-aiddefpathspecapathspec
      - Informative, more elaborate description of pathspecs:
        https://css-tricks.com/git-pathspecs-and-how-to-use-them/
    """

    # TODO: think about adding support for another magic that represents
    # the root of a repository hierarchy (amending 'top', which is
    # the root of the working tree -- but presumably for a single repository)
    spectypes: tuple[str, ...]
    """Long-form pathspec type identifiers"""
    dirprefix: str | None
    """Directory prefix (pathspec up to the last slash) limiting the scope"""
    pattern: str | None
    """Pattern to match paths against using ``fnmatch``"""

    @property
    def is_nopathspecs(self) -> bool:
        """Whether this pathspec is the "no pathspecs" pathspec, AKA ``':'``"""
        return not self.spectypes and not self.dirprefix and not self.pattern

    def __str__(self) -> str:
        """Generate normalized (long-form) pathspec"""
        if self.is_nopathspecs:
            return ':'
        magic = f':({",".join(self.spectypes)})' if self.spectypes else ''
        return f'{magic}{self.get_joined_pattern()}'

    def get_joined_pattern(self) -> str:
        """Return directory prefix and pattern joined into a single string

        A ``/`` separator is only inserted when there is a (non-empty)
        directory prefix. A missing prefix or pattern contributes an empty
        string.
        """
        prefix = f'{self.dirprefix}/' if self.dirprefix else ''
        return f'{prefix}{self.pattern or ""}'

    def for_subdir(self, subdir: str) -> list[GitPathSpec]:
        """Translate a pathspec into the scope of a subdirectory.

        The processing implemented here is purely lexical. This means that it
        works without matching against actual file system (or Git tree)
        content. Consequently, to some degree, overly broad results are
        produced, but at the same time use cases are supported where there
        is nothing (yet) to match against (e.g., a not-yet-cloned submodule).

        A pathspec with a ``top`` magic is produced unmodified, as they are
        defined relative to the root of a repository, not relative to a base
        directory. As a consequence, such pathspecs will automatically
        refer to a submodule root when the target directory is contained in
        one.

        Parameters
        ----------
        subdir: str
          Relative path in POSIX notation

        Returns
        -------
        list
          When an empty list is returned, this indicates that the pathspec
          cannot be translated to the given ``subdir``, because it does
          not match the ``subdir`` itself. If a pathspec translates to
          "no pathspecs" (``':'``), a list with a dedicated ':' pathspec is
          returned.
        """
        # special case of a non-translation (pretty much only here to
        # make some test implementations simpler)
        if not subdir:
            return [self]

        return list(yield_subdir_match_remainder_pathspecs(subdir, self))

    @classmethod
    def from_pathspec_str(
        cls,
        pathspec: str,
    ) -> GitPathSpec:
        """Parse a string-form pathspec into types, prefix, and pattern

        Parameters
        ----------
        pathspec: str
          Pathspec in plain, short-form magic (``:!pattern``), or long-form
          magic (``:(glob,icase)pattern``) notation.

        Raises
        ------
        ValueError
          When a long-form magic is not terminated by a closing parenthesis,
          or when the mutually exclusive ``glob`` and ``literal`` magic
          are both present.
        """
        if pathspec == ':':
            # shortcut for the special no-path-spec pathspec
            return cls((), '', None)

        spectypes: list[str] = []
        if pathspec.startswith(':('):
            # long-form magic
            magic, closing, pattern = pathspec[2:].partition(')')
            if not closing:
                msg = (
                    'long-form pathspec magic is not terminated by a '
                    f"closing parenthesis: {pathspec!r}"
                )
                raise ValueError(msg)
            # the list of magic words may be empty (':()pattern'),
            # do not record empty identifiers
            spectypes = [word for word in magic.split(',') if word]
        elif pathspec.startswith(':'):
            # short-form magic
            spectypes, pattern = _pathspec_from_shortform(pathspec)
        else:
            pattern = pathspec

        # raise when glob and literal magic markers are present
        # simultaneously
        if 'glob' in spectypes and 'literal' in spectypes:
            msg = "'glob' magic is incompatible with 'literal' magic"
            raise ValueError(msg)

        # split off dirprefix
        dirprefix, pattern = _split_prefix_pattern(pattern)

        return cls(
            spectypes=tuple(spectypes),
            dirprefix=dirprefix,
            pattern=pattern,
        )
| 162 | + |
| 163 | + |
| 164 | +def _pathspec_from_shortform(spec: str) -> tuple[list[str], str]: |
| 165 | + # short-form magic |
| 166 | + magic_signatures = { |
| 167 | + '/': 'top', |
| 168 | + '!': 'exclude', |
| 169 | + '^': 'exclude', |
| 170 | + ':': None, |
| 171 | + } |
| 172 | + pattern = spec[1:] |
| 173 | + spectypes: list[str] = [] |
| 174 | + for i in range(1, len(spec)): |
| 175 | + sig = magic_signatures.get(spec[i]) |
| 176 | + if sig is None: |
| 177 | + return (spectypes, spec[i:]) |
| 178 | + spectypes.append(sig) |
| 179 | + return (spectypes, pattern) |
| 180 | + |
| 181 | + |
| 182 | +def _split_prefix_pattern(pathspec: str) -> tuple[str | None, str | None]: |
| 183 | + # > the pathspec up to the last slash represents a directory prefix. |
| 184 | + # > The scope of that pathspec is limited to that subtree. |
| 185 | + pattern: str | None = None |
| 186 | + try: |
| 187 | + last_slash_idx = pathspec[::-1].index('/') |
| 188 | + except ValueError: |
| 189 | + # everything is the pattern |
| 190 | + dirprefix = None |
| 191 | + pattern = pathspec |
| 192 | + else: |
| 193 | + dirprefix = pathspec[: -last_slash_idx - 1] |
| 194 | + pattern = pathspec[-last_slash_idx:] if last_slash_idx > 0 else None |
| 195 | + return dirprefix, pattern |
| 196 | + |
| 197 | + |
def _iter_parent_dirs(path: str) -> Generator[str, None, None]:
    """Yield successively shorter parent directories of a POSIX path

    Iteration stops when no parent is left, or when ``posixpath.split()``
    no longer shortens the path (which is the case for a path consisting
    of slashes only). The latter guards against endless iteration.
    """
    while True:
        parent = posixpath.split(path)[0]
        if not parent or parent == path:
            return
        yield parent
        path = parent


def yield_subdir_match_remainder_pathspecs(
    subdir: str,
    pathspec: GitPathSpec,
) -> Generator[GitPathSpec, None, None]:
    """Translate a pathspec into a set of possible subdirectory pathspecs

    The processing implemented here is purely lexical. This means that it
    works without matching against actual file system (or Git tree) content.
    This means that it yields, to some degree, overly broad results, but also
    that it works in cases where there is nothing (yet) to match against.
    For example, a not-yet-cloned submodule.

    This function does not perform any validity checking of pathspecs. Only
    valid pathspecs and well-formed paths are supported.

    A pathspec with the ``top`` magic is returned immediately and as-is. These
    pathspecs have an absolute reference and do not require a translation into
    a subdirectory namespace.

    Parameters
    ----------
    subdir: str
      POSIX-notation relative path of a subdirectory. The reference directory
      must be the same as that of the pathspec to be translated.
    pathspec: GitPathSpec
      To-be-translated pathspec

    Yields
    ------
    GitPathSpec
      Any number of pathspecs that an input pathspec decomposed into upon
      translation into the namespace of a subdirectory. Nothing is yielded
      when the pathspec cannot match anything underneath ``subdir``.
    """
    if 'top' in pathspec.spectypes or pathspec.is_nopathspecs:
        # pathspec with an absolute reference, or "no pathspecs"
        # no translation needed
        yield pathspec
        return

    # add a trailing directory separator to prevent undesired
    # matches of partial directory names
    subdir = subdir if subdir.endswith('/') else f'{subdir}/'
    tp = pathspec.get_joined_pattern()

    if 'icase' in pathspec.spectypes:
        subdir = subdir.casefold()
        tp = tp.casefold()

    # literal pathspecs
    if 'literal' in pathspec.spectypes:
        # make sure there is a trailing slash to allow for full matches
        tp_endslash = tp if tp.endswith('/') else f'{tp}/'
        if tp_endslash.startswith(subdir):
            remainder = tp[len(subdir):]
            if not remainder:
                # full match
                yield GitPathSpec.from_pathspec_str(':')
            else:
                yield GitPathSpec(
                    pathspec.spectypes,
                    *_split_prefix_pattern(remainder),
                )
        elif subdir.startswith(tp_endslash):
            # the pathspec names a directory that contains the
            # (multi-level) subdir, hence everything underneath the
            # subdir matches. Sharing only a partial leading path
            # (e.g. 'dir2/file' vs. subdir 'dir') is not a match.
            yield GitPathSpec.from_pathspec_str(':')
        return

    # tokenize the testpattern using the wildcard that also matches
    # directories
    token_delim = '**' if 'glob' in pathspec.spectypes else '*'
    # each token is represented by a single '*' in the patterns given to
    # fnmatch. When a position in such a pattern is mapped back onto the
    # original pattern, the length difference of every token passed must
    # be compensated for.
    delim_excess = len(token_delim) - 1
    delim_positions: list[int] = []
    tp_chunks = tp.split(token_delim)
    prefix_match = ''
    yielded = set()
    for i, chunk in enumerate(tp_chunks):
        last_chunk = i + 1 == len(tp_chunks)
        if last_chunk:
            trymatch = f'{prefix_match}{chunk}{"" if chunk.endswith("/") else "/"}'
        else:
            trymatch = f'{prefix_match}{chunk}*'
            delim_positions.append(len(trymatch) - 1)
        if not fnmatch(subdir, trymatch):
            # each chunk needs match in order, first non-match ends the
            # algorithm
            # BUT
            # we have an (initial) chunk that points already
            # inside the target subdir
            for submatch in _iter_parent_dirs(trymatch):
                if not fnmatch(subdir, f'{submatch}/'):
                    continue
                # length of the part of the original pattern that
                # corresponds to `submatch`
                consumed = len(submatch) + delim_excess * sum(
                    pos < len(submatch) for pos in delim_positions
                )
                ps = GitPathSpec(
                    pathspec.spectypes,
                    *_split_prefix_pattern(
                        # +1 for trailing slash
                        tp[consumed + 1:]
                    ),
                )
                if ps not in yielded:
                    yield ps
                return
            # OR
            # we might have a multi-level subdir, and we might match an
            # intermediate subdir and could still yield a 'no pathspec'
            # result
            for parent in _iter_parent_dirs(subdir):
                if fnmatch(f'{parent}/', trymatch):
                    yield GitPathSpec.from_pathspec_str(':')
                    return
            return

        remainder = tp_chunks[i + 1:]
        if not any(remainder):
            # direct hit, no pathspecs after translation
            yield GitPathSpec.from_pathspec_str(':')
            return

        ps = GitPathSpec(
            pathspec.spectypes,
            *_split_prefix_pattern(
                f'{token_delim}{token_delim.join(remainder)}',
            ),
        )
        yield ps
        yielded.add(ps)
        # extend prefix for the next round
        prefix_match = trymatch
0 commit comments