Merge pull request #326 from bids-standard/consistent-entities

Ensure consistent entities at all levels
bids-standard · Feb 1, 2019 · f2a333c
2 parents 99178b7 + cf95dd7
commit f2a333c
Show file tree

Hide file tree

Showing 5 changed files with 36 additions and 34 deletions.
diff --git a/bids/analysis/tests/test_analysis.py b/bids/analysis/tests/test_analysis.py
@@ -31,25 +31,28 @@ def test_get_design_matrix_arguments(analysis):
     kwargs = dict(run=1, subject='01', sparse=True)
     result = analysis['run'].get_design_matrix(**kwargs)
     result = result[0]
-    assert result.sparse.shape == (172, 7)
+    assert result.sparse.shape == (172, 9)
     assert result.dense is None
 
     kwargs = dict(run=1, subject='01', mode='dense', force=False)
     result = analysis['run'].get_design_matrix(**kwargs)[0]
     assert result.sparse is None
     assert result.dense is None
 
-    kwargs = dict(run=1, subject='01', mode='dense', force=True, sampling_rate='highest')
+    kwargs = dict(run=1, subject='01', mode='dense', force=True,
+                  sampling_rate='highest')
     result = analysis['run'].get_design_matrix(**kwargs)[0]
     assert result.sparse is None
     assert result.dense.shape == (4800, 6)
 
-    kwargs = dict(run=1, subject='01', mode='dense', force=True, sampling_rate='TR')
+    kwargs = dict(run=1, subject='01', mode='dense', force=True,
+                  sampling_rate='TR')
     result = analysis['run'].get_design_matrix(**kwargs)[0]
     assert result.sparse is None
     assert result.dense.shape == (240, 6)
 
-    kwargs = dict(run=1, subject='01', mode='dense', force=True, sampling_rate=0.5)
+    kwargs = dict(run=1, subject='01', mode='dense', force=True,
+                  sampling_rate=0.5)
     result = analysis['run'].get_design_matrix(**kwargs)[0]
     assert result.sparse is None
     assert result.dense.shape == (240, 6)
@@ -72,11 +75,11 @@ def test_first_level_sparse_design_matrix(analysis):
     result = analysis['run'].get_design_matrix(subject=['01'])
     assert len(result) == 3
     df = result[0].sparse
-    assert df.shape == (172, 7)
+    assert df.shape == (172, 9)
     assert df['condition'].nunique() == 2
     assert set(result[0][0].columns) == {'amplitude', 'onset', 'duration',
                                          'condition', 'subject', 'run',
-                                         'task'}
+                                         'task', 'datatype', 'suffix'}
 
 
 def test_post_first_level_sparse_design_matrix(analysis):
@@ -87,7 +90,9 @@ def test_post_first_level_sparse_design_matrix(analysis):
     assert result[0].sparse.shape == (9, 2)
     assert result[0].entities == {
         'subject': '01',
-        'task': 'mixedgamblestask'}
+        'task': 'mixedgamblestask',
+        'datatype': 'func',
+        'suffix': 'bold'}
 
     # Participant level and also check integer-based indexing
     result = analysis['participant'].get_design_matrix()

diff --git a/bids/variables/io.py b/bids/variables/io.py
@@ -201,7 +201,7 @@ def _load_time_variables(layout, dataset=None, columns=None, scan_length=None,
                         # Add in all of the run's entities as new columns for
                         # index
                         for entity, value in entities.items():
-                            if entity in BASE_ENTITIES:
+                            if entity in ALL_ENTITIES:
                                 df[entity] = value
 
                         if drop_na:
@@ -327,14 +327,20 @@ def _load_tsv_variables(layout, suffix, dataset=None, columns=None,
         # file (for entities that vary by row), or from the full file path
         # (for entities constant over all rows in the file). We extract both
         # and store them in the main DataFrame alongside other variables (as
-        # they'll be extracted when the Column is initialized anyway).
+        # they'll be extracted when the BIDSVariable is initialized anyway).
         for ent_name, ent_val in f.entities.items():
-            if ent_name in BASE_ENTITIES:
+            if ent_name in ALL_ENTITIES:
                 _data[ent_name] = ent_val
 
         # Handling is a bit more convoluted for scans.tsv, because the first
         # column contains the run filename, which we also need to parse.
         if suffix == 'scans':
+
+            # Suffix is guaranteed to be present in each filename, so drop the
+            # constant column with value 'scans' to make way for it and prevent
+            # two 'suffix' columns.
+            _data.drop(columns='suffix', inplace=True)
+
             image = _data['filename']
             _data = _data.drop('filename', axis=1)
             dn = f.dirname
@@ -369,12 +375,11 @@ def make_patt(x, regex_search=False):
         # Filter rows on all selectors
         comm_cols = list(set(_data.columns) & set(selectors.keys()))
         for col in comm_cols:
-            for val in listify(selectors.get(col)):
-                ent_patts = [make_patt(x, regex_search=layout.regex_search)
-                             for x in listify(selectors.get(col))]
-                patt = '|'.join(ent_patts)
+            ent_patts = [make_patt(x, regex_search=layout.regex_search)
+                            for x in listify(selectors.get(col))]
+            patt = '|'.join(ent_patts)
 
-                _data = _data[_data[col].str.contains(patt)]
+            _data = _data[_data[col].str.contains(patt)]
 
         level = {'scans': 'session', 'sessions': 'subject',
                  'participants': 'dataset'}[suffix]

diff --git a/bids/variables/tests/test_collections.py b/bids/variables/tests/test_collections.py
@@ -57,35 +57,27 @@ def test_run_variable_collection_to_df(run_coll):
 
     # All variables sparse, wide format
     df = run_coll.to_df()
-    assert df.shape == (4096, 13)
+    assert df.shape == (4096, 15)
     wide_cols = {'onset', 'duration', 'subject', 'run', 'task',
                  'PTval', 'RT', 'gain', 'loss', 'parametric gain', 'respcat',
-                 'respnum', 'trial_type'}
+                 'respnum', 'trial_type', 'suffix', 'datatype'}
     assert set(df.columns) == wide_cols
 
     # All variables sparse, long format
     df = run_coll.to_df(format='long')
-    assert df.shape == (32768, 7)
+    assert df.shape == (32768, 9)
     long_cols = {'amplitude', 'duration', 'onset', 'condition', 'run',
-                 'task', 'subject'}
+                 'task', 'subject', 'suffix', 'datatype'}
     assert set(df.columns) == long_cols
 
     # All variables dense, wide format
     df = run_coll.to_df(sparse=False)
     assert df.shape == (230400, 14)
-    # The inclusion of 'modality' and 'type' here is a minor bug that should
-    # be fixed at some point. There is no reason why to_df() should return
-    # more columns for a DenseRunVariable than a SparseRunVariable, but this
-    # is happening because these columns are not included in the original
-    # SparseRunVariable data, and are being rebuilt from the entity list in
-    # the DenseRunVariable init.
-    wide_cols |= {'datatype', 'suffix'}
     assert set(df.columns) == wide_cols - {'trial_type'}
 
     # All variables dense, long format
     df = run_coll.to_df(sparse=False, format='long')
     assert df.shape == (1612800, 9)
-    long_cols |= {'datatype', 'suffix'}
     assert set(df.columns) == long_cols
 
 
@@ -100,14 +92,14 @@ def test_merge_collections(run_coll, run_coll_list):
 def test_get_collection_entities(run_coll_list):
     coll = run_coll_list[0]
     ents = coll.entities
-    assert {'run', 'task', 'subject'} == set(ents.keys())
+    assert {'run', 'task', 'subject', 'suffix', 'datatype'} == set(ents.keys())
 
     merged = merge_collections(run_coll_list[:3])
     ents = merged.entities
-    assert {'task', 'subject'} == set(ents.keys())
+    assert {'task', 'subject', 'suffix', 'datatype'} == set(ents.keys())
     assert ents['subject'] == '01'
 
     merged = merge_collections(run_coll_list[3:6])
     ents = merged.entities
-    assert {'task', 'subject'} == set(ents.keys())
+    assert {'task', 'subject', 'suffix', 'datatype'} == set(ents.keys())
     assert ents['subject'] == '02'
diff --git a/bids/variables/tests/test_entities.py b/bids/variables/tests/test_entities.py
@@ -69,7 +69,7 @@ def test_get_collections_merged(layout1):
     vals = collection.variables['RT'].values
     ents = collection.variables['RT'].index
     assert len(ents) == len(vals) == 4096
-    assert set(ents.columns) == {'task', 'run', 'subject'}
+    assert set(ents.columns) == {'task', 'run', 'subject', 'suffix', 'datatype'}
 
 
 def test_get_collections_unmerged(layout2):

diff --git a/bids/variables/tests/test_io.py b/bids/variables/tests/test_io.py
@@ -38,7 +38,7 @@ def test_load_events(layout1):
     targ_cols = {'parametric gain', 'PTval', 'trial_type', 'respnum'}
     assert not (targ_cols - set(variables.keys()))
     assert isinstance(variables['parametric gain'], SparseRunVariable)
-    assert variables['parametric gain'].index.shape == (86, 3)
+    assert variables['parametric gain'].index.shape == (86, 5)
     assert variables['parametric gain'].source == 'events'
 
 
@@ -51,12 +51,12 @@ def test_load_participants(layout1):
     assert {'age', 'sex'} == set(dataset.variables.keys())
     age = dataset.variables['age']
     assert isinstance(age, SimpleVariable)
-    assert age.index.shape == (16, 1)
+    assert age.index.shape == (16, 2)
     assert age.values.shape == (16,)
 
     index = load_variables(layout1, types='participants', subject=['^1.*'])
     age = index.get_nodes(level='dataset')[0].variables['age']
-    assert age.index.shape == (7, 1)
+    assert age.index.shape == (7, 2)
     assert age.values.shape == (7,)