Skip to content

Commit 1f04ab9

Browse files
committed
feat: Add bidsignore implementation
1 parent 87c1ae3 commit 1f04ab9

File tree

2 files changed

+218
-0
lines changed

2 files changed

+218
-0
lines changed

src/bids_validator/bidsignore.py

+129
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""Utilities for working with .bidsignore files."""
2+
3+
import os
4+
import re
5+
from functools import lru_cache
6+
from typing import Protocol, Union
7+
8+
import attrs
9+
10+
from .types.files import FileTree
11+
12+
13+
@lru_cache
def compile_pat(pattern: str) -> Union[re.Pattern, None]:
    """Compile a .gitignore-style ignore line to a regular expression.

    The returned expression is meant to be applied with ``.match()`` to a
    "/"-separated relative path; directories are expected to carry a trailing "/".

    Returns ``None`` for lines that match nothing: comments (leading "#") and lines
    that are empty once unescaped trailing spaces are removed.

    Raises ``ValueError`` for negated patterns (leading "!"), which are not supported.

    Only ``*``, ``?``, ``**`` and ``.`` are translated; every other character is passed
    through to the regular expression engine unchanged, so character ranges only work
    when they are also valid regex.
    """
    # A line starting with # serves as a comment.
    if pattern.startswith('#'):
        return None

    # An optional prefix "!" negates the pattern. Negation is not implemented, so fail
    # loudly instead of silently treating the line as a positive match.
    if pattern.startswith('!'):
        raise ValueError(f'Inverted patterns not supported: {pattern}')

    # A leading "\#" or "\!" stands for a literal "#" or "!": drop the backslash.
    if pattern.startswith((r'\#', r'\!')):
        pattern = pattern[1:]

    # Trailing spaces are ignored unless they are quoted with backslash ("\").
    pattern = re.sub(r'(?<!\\) +$', '', pattern)

    # A blank line matches no files, so it can serve as a separator for readability.
    if pattern == '':
        return None

    # If there is a separator at the beginning or middle (or both) of the pattern,
    # then the pattern is relative to the root; otherwise it may match at any depth.
    relative_match = pattern == '/' or '/' in pattern[:-1]
    # If there is a separator at the end of the pattern then the pattern will only match
    # directories, otherwise the pattern can match both files and directories.
    directory_match = pattern.endswith('/')

    # Translate one path component at a time: a bare "**" spans directories, while
    # "*" and "?" never cross a "/".
    parts = [
        '.*'
        if part == '**'
        else part.replace('*', '[^/]*').replace('?', '[^/]').replace('.', r'\.')
        for part in pattern.strip('/').split('/')
    ]

    prefix = '^' if relative_match else '^(.*/|)'
    postfix = r'/\Z' if directory_match else r'/?\Z'

    # "**/" matches zero or more whole directories, so the wildcard and its slash must be
    # optional together. Making only the slash optional would let "**/foo" match "barfoo".
    out_pattern = '/'.join(parts).replace('.*/', '(?:.*/)?')

    return re.compile(f'{prefix}{out_pattern}{postfix}')
63+
64+
65+
class HasMatch(Protocol):
    """Structural type for any object offering a ``match(relpath)`` predicate."""

    def match(self, relpath: str) -> bool:
        """Report whether the given relative path is matched."""
67+
68+
69+
@attrs.define
class Ignore:
    """Collection of .gitignore-style patterns.

    Tracks successfully matched files for reporting.
    """

    # Raw ignore lines exactly as supplied; may include comments and blank lines.
    patterns: list[str] = attrs.field(factory=list)
    # Every path for which match() returned True, appended in call order.
    history: list[str] = attrs.field(factory=list, init=False)

    @classmethod
    def from_file(cls, pathlike: os.PathLike):
        """Load Ignore contents from file, one pattern per line."""
        with open(pathlike) as fobj:
            return cls([line.rstrip('\n') for line in fobj])

    def match(self, relpath: str) -> bool:
        """Match a relative path against a collection of ignore patterns.

        Returns True, and records ``relpath`` in ``history``, as soon as one pattern
        matches; returns False if none do. Lines that ``compile_pat`` compiles to
        ``None`` (comments and blank lines) are skipped. A ``ValueError`` raised by
        ``compile_pat`` for an unsupported pattern propagates to the caller.
        """
        for pattern in self.patterns:
            regex = compile_pat(pattern)
            # Comment and blank lines have no regex; calling .match() on None would crash.
            if regex is not None and regex.match(relpath):
                self.history.append(relpath)
                return True
        return False
91+
92+
93+
@attrs.define
class IgnoreMany:
    """Combine several ignore filters into a single matcher."""

    ignores: list[Ignore] = attrs.field()

    def match(self, relpath: str) -> bool:
        """Report whether at least one filter matches the given file.

        Filters are consulted in list order and evaluation stops at the first hit,
        so the order matters for side effects such as each filter recording the
        files it ignored.
        """
        for candidate in self.ignores:
            if candidate.match(relpath):
                return True
        return False
106+
107+
108+
def filter_file_tree(filetree: FileTree) -> FileTree:
    """Apply the tree's own .bidsignore, if any, and return the filtered tree.

    When the root has no truthy ``.bidsignore`` child, the input tree is returned as is.
    Otherwise the patterns read from that file are applied first, followed by a
    built-in rule that drops the root-level ``.bidsignore`` itself.
    """
    ignore_file = filetree.children.get('.bidsignore')
    if ignore_file:
        user_rules = Ignore.from_file(ignore_file)
        builtin_rules = Ignore(['/.bidsignore'])
        return _filter(filetree, IgnoreMany([user_rules, builtin_rules]))
    return filetree
115+
116+
117+
def _filter(filetree: FileTree, ignore: HasMatch) -> FileTree:
    """Recursively drop children whose relative path is matched by ``ignore``.

    Returns the very same ``filetree`` object when nothing beneath it was removed or
    replaced; otherwise returns a copy made with ``attrs.evolve`` holding the
    surviving children.
    """
    kept = {}
    changed = False
    for name, child in filetree.children.items():
        if ignore.match(child.relative_path):
            # Dropped entries make the surviving mapping differ from the original.
            changed = True
            continue
        filtered_child = _filter(child, ignore)
        if filtered_child is not child:
            changed = True
        kept[name] = filtered_child

    # Only build a new node when something actually changed below this point.
    if changed:
        return attrs.evolve(filetree, children=kept)
    return filetree

tests/test_bidsignore.py

+89
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Test bids_validator.bidsignore."""
2+
3+
import pytest
4+
5+
from bids_validator.bidsignore import Ignore, compile_pat, filter_file_tree
6+
from bids_validator.types.files import FileTree
7+
8+
9+
@pytest.mark.parametrize(
    ('pattern', 'hits', 'misses'),
    [
        ('/', ['/'], ['dir/', 'file']),
        # Match file or directory named foo
        ('foo', ['foo', 'foo/', 'bar/foo', 'bar/foo/'], ['bar', 'foobar', 'barfoo', 'barfoo/']),
        # Directories named foo only
        ('foo/', ['foo/', 'bar/foo/'], ['foo', 'bar/foo', 'bar', 'foobar', 'barfoo', 'barfoo/']),
        # Files or directories at the root
        ('/foo', ['foo', 'foo/'], ['bar/foo', 'bar/foo/', 'bar', 'foobar', 'barfoo', 'barfoo/']),
        # doc/frotz/ examples from GITIGNORE(5)
        ('doc/frotz/', ['doc/frotz/'], ['a/doc/frotz/']),
        ('frotz/', ['frotz/', 'doc/frotz/', 'a/doc/frotz/'], []),
        # * matches everything because everything has a basename
        ('*', ['foo', 'foo/', 'foo/bar', 'foo/bar/'], []),
        # *o matches things with basename ending in o
        ('*o', ['foo', 'foo/', 'bar/foo', 'bar/foo/'], ['foo/bar', 'foo/bar/']),
        # Leading **/ matches in all directories
        ('**/foo', ['foo', 'foo/', 'bar/foo', 'bar/foo/'], ['foo/bar', 'foo/bar/', 'baz/foobar']),
        ('**/foo/bar', ['foo/bar', 'foo/bar/', 'a/foo/bar'], ['foo/', 'bar/foo', 'bar']),
        # Trailing /** matches everything inside a root-relative directory
        ('foo/**', ['foo/', 'foo/x', 'foo/x/y/z'], ['foo', 'bar/foo/x/y/z']),
        # /**/ matches zero or more directories
        ('a/**/b', ['a/b', 'a/x/b', 'a/x/y/b'], ['x/a/b', 'x/a/y/b']),
        # ** surrounded by something other than slashes acts like a regular *
        ('a/x**/b', ['a/x/b', 'a/xy/b'], ['x/a/b', 'x/a/y/b', 'a/x/y/b']),
        # Escaped special prefixes
        (r'\#*', ['#', '#foo'], ['foo', 'bar#']),
        (r'\!*', ['!', '!foo'], ['foo', 'bar!']),
    ],
)
def test_patterns(pattern, hits, misses):
    """Check that each compiled pattern accepts its hits and rejects its misses."""
    compiled = compile_pat(pattern)

    not_matched = [path for path in hits if not compiled.match(path)]
    assert not not_matched

    wrongly_matched = [path for path in misses if compiled.match(path)]
    assert not wrongly_matched
47+
48+
49+
def test_skipped_patterns():
    """Check lines that compile to nothing, and the rejection of negated patterns."""
    # Blank, comment and whitespace-only lines all compile to None.
    for skipped_line in ('', '# commented line', ' '):
        assert compile_pat(skipped_line) is None

    with pytest.raises(ValueError, match='Inverted patterns not supported'):
        compile_pat('!inverted pattern')
56+
57+
58+
def test_Ignore_ds000117(examples):
    """Load the ds000117 .bidsignore and check which paths it matches."""
    tree = FileTree.read_from_filesystem(examples / 'ds000117')
    rules = Ignore.from_file(tree.children['.bidsignore'])

    assert 'run-*_echo-*_FLASH.json' in rules.patterns
    assert 'sub-01/ses-mri/anat/sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz' in tree
    assert rules.match('sub-01/ses-mri/anat/sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz')
    assert not rules.match('acq-mprage_T1w.json')

    # Walk down to the same file through the tree and match on the path it reports.
    node = tree
    for name in ('sub-01', 'ses-mri', 'anat', 'sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz'):
        node = node.children[name]
    assert rules.match(node.relative_path)
73+
74+
75+
def test_filter_file_tree(examples):
    """Check tree filtering for datasets with and without a .bidsignore."""
    # ds000117 ships a .bidsignore: both it and the files it lists are removed.
    with_ignore = FileTree.read_from_filesystem(examples / 'ds000117')
    assert '.bidsignore' in with_ignore
    assert 'sub-01/ses-mri/anat/sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz' in with_ignore

    pruned = filter_file_tree(with_ignore)
    assert '.bidsignore' not in pruned
    assert 'sub-01/ses-mri/anat/sub-01_ses-mri_run-1_echo-1_FLASH.nii.gz' not in pruned

    # ds000247 has no .bidsignore: the identical tree object is handed back.
    without_ignore = FileTree.read_from_filesystem(examples / 'ds000247')
    assert '.bidsignore' not in without_ignore

    untouched = filter_file_tree(without_ignore)
    assert untouched is without_ignore

0 commit comments

Comments
 (0)