Skip to content

Commit

Permalink
add extra checks for warnings of sets metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
LoannPeurey committed Jan 22, 2025
1 parent e85c5cc commit e7cba61
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ annotation_algorithm_repo: 'https://github.com/LoannPeurey/ALICE/tree/cae98d47a1
date_annotation: '2024-04-07'
has_speaker_type: 'Y'
has_words: 'Y'
invented: "made up field to trigger warning"
9 changes: 0 additions & 9 deletions examples/valid_raw_data/annotations/old_its/metannots.yml

This file was deleted.

1 change: 1 addition & 0 deletions examples/valid_raw_data/annotations/old_its/metannots.yml
1 change: 1 addition & 0 deletions examples/valid_raw_data/annotations/textgrid/metannots.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ date_annotation: '2019-04-05'
has_speaker_type: 'Y'
has_vcm_type: 'Y'
has_addressee: 'Y'
random_field: 34
9 changes: 0 additions & 9 deletions examples/valid_raw_data/annotations/vtc_rttm/metannots.yml

This file was deleted.

74 changes: 45 additions & 29 deletions tests/test_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def project(request):
if os.path.exists(PATH):
# shutil.copytree(src="examples/valid_raw_data", dst="output/annotations")
shutil.rmtree(PATH)
shutil.copytree(src="examples/valid_raw_data", dst=PATH)
shutil.copytree(src="examples/valid_raw_data", dst=PATH, symlinks=True)

project = ChildProject(PATH)

Expand Down Expand Up @@ -196,7 +196,9 @@ def test_import(project, am):
assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected"

errors, warnings = am.read()
assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected"
assert (len(errors) == 0 and
warnings == ["Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations/<set>/metannots.yml", "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`", "Metadata files for sets contain the following unknown fields ['invented', 'random_field'] which can be found in the metadata for sets ['alice/output', 'textgrid']"]
), "malformed annotation indexes detected"

for dataset in ["eaf_basic", "textgrid", "eaf_solis"]:
annotations = am.annotations[am.annotations["set"] == dataset]
Expand Down Expand Up @@ -290,7 +292,9 @@ def test_multiple_imports(project, am, input_file, ow, rimported, rerrors, excep
errors, warnings = am.read()
print(errors)
print(warnings)
assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected"
assert (len(errors) == 0 and
warnings == ["Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations/<set>/metannots.yml", "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`", "Metadata files for sets contain the following unknown fields ['invented', 'random_field'] which can be found in the metadata for sets ['alice/output', 'textgrid']"]
), "malformed annotation indexes detected"


def test_import_incorrect_data_types(project, am):
Expand Down Expand Up @@ -765,35 +769,47 @@ def test_set_from_path(project, am):
== "set/subset"
)

# TODO : Add testing for all the kinds of warnings?
# Warning messages expected from _read_sets_metadata, written once so the 'return' and 'log'
# cases below cannot drift apart.
# NOTE(review): the list order inside each message is part of the asserted text; it presumably
# follows the order of the sets given to the manager (here built from os.listdir) - confirm it
# is stable across platforms.
_SETS_WARNINGS_WITH_METADATA = [
    "Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations/<set>/metannots.yml",
    "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`",
    "Metadata files for sets contain the following unknown fields ['random_field', 'invented'] which can be found in the metadata for sets ['textgrid', 'alice/output']",
]
_SETS_WARNINGS_WITHOUT_METADATA = [
    "Metadata files for sets ['vtc_rttm', 'textgrid', 'metrics', 'vtc_present', 'new_its', 'eaf_basic', 'alice/output', 'eaf_solis', 'textgrid2'] could not be found, they should be created as annotations/<set>/metannots.yml",
    "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`",
]


def _as_log_records(messages):
    """Return the ``caplog.record_tuples`` entries expected for *messages*.

    Each entry is ``(logger_name, level, message)``; 30 is the numeric value of
    ``logging.WARNING``.
    """
    return [('ChildProject.annotations', 30, message) for message in messages]


@pytest.mark.parametrize(
    "metadata_exists,warning,truth_path,warnings,log",
    [
        (True, 'ignore', TRUTH / 'sets_metadata.csv', None, []),
        (True, 'return', TRUTH / 'sets_metadata.csv', _SETS_WARNINGS_WITH_METADATA, []),
        (True, 'log', TRUTH / 'sets_metadata.csv', None, _as_log_records(_SETS_WARNINGS_WITH_METADATA)),
        (False, 'ignore', TRUTH / 'sets_empty_metadata.csv', None, []),
        (False, 'return', TRUTH / 'sets_empty_metadata.csv', _SETS_WARNINGS_WITHOUT_METADATA, []),
        (False, 'log', TRUTH / 'sets_empty_metadata.csv', None, _as_log_records(_SETS_WARNINGS_WITHOUT_METADATA)),
    ],
)
def test_read_sets_metadata(project, am, caplog, metadata_exists, warning, truth_path, warnings, log):
    """Check the dataframe, returned warnings and log records of ``_read_sets_metadata``.

    :param metadata_exists: when False, every existing metannots.yml file is deleted first
    :param warning: warning mode passed to ``am._read_sets_metadata``
    :param truth_path: csv file holding the expected sets metadata
    :param warnings: expected list of returned warnings, or None when only a dataframe is returned
    :param log: expected content of ``caplog.record_tuples`` after the call
    """
    # Rather than importing the annotation sets (which relies on having the importation work
    # correctly), just create a fake annotation record that can be used to load metadata.
    annotations_dir = project.path / ANNOTATIONS
    sets = [
        name if name != 'alice' else 'alice/output'
        for name in os.listdir(annotations_dir)
        if os.path.isdir(annotations_dir / name)
    ]
    zeros = [0] * len(sets)
    blanks = [''] * len(sets)

    am.annotations = pd.DataFrame({'set': sets, 'range_onset': zeros, 'range_offset': zeros,
                                   'annotation_filename': blanks, 'raw_filename': blanks})

    if not metadata_exists:
        for set_name in am.annotations['set'].unique():
            metannots = annotations_dir / set_name / METANNOTS
            if os.path.exists(metannots):
                os.remove(metannots)

    dtypes = {f.name: f.dtype if f.dtype is not None else 'string' for f in AnnotationManager.SETS_COLUMNS}
    # The truth file carries a 'duration' column that is dropped before comparing.
    truth_df = pd.read_csv(truth_path, index_col='set', dtype=dtypes).drop(columns='duration')

    result = am._read_sets_metadata(warning)

    assert caplog.record_tuples == log

    if warnings is not None:
        # In this case the call is expected to return (dataframe, warnings).
        result, returned_warnings = result
        assert returned_warnings == warnings
    pd.testing.assert_frame_equal(truth_df, result, check_like=True, check_dtype=False)

@pytest.mark.parametrize("metadata_exists,return_value",
[(True, TRUTH / 'sets_metadata.csv'),
Expand All @@ -818,7 +834,7 @@ def test_get_sets_metadata(project, am, metadata_exists, return_value):
result = am.get_sets_metadata()
# result.to_csv(return_value, index_label='set')
dtypes = {f.name: f.dtype if f.dtype is not None else 'string' for f in AnnotationManager.SETS_COLUMNS}
pd.testing.assert_frame_equal(pd.read_csv(return_value, index_col='set', dtype=dtypes), result, check_like=True)
pd.testing.assert_frame_equal(pd.read_csv(return_value, index_col='set', dtype=dtypes), result, check_like=True, check_dtype=False)


# its
Expand Down
22 changes: 11 additions & 11 deletions tests/truth/sets_metadata.csv
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
set,segmentation,segmentation_type,method,annotation_algorithm_name,annotation_algorithm_publication,annotation_algorithm_version,annotation_algorithm_repo,date_annotation,has_speaker_type,sampling_method,sampling_target,sampling_count,sampling_unit_duration,recording_selection,participant_selection,annotator_name,annotator_experience,has_vcm_type,has_addressee,has_words,has_transcription,has_interactions,has_acoustics,duration
vtc_rttm,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-14,Y,,,,,,,,,,,,,,,0
textgrid,textgrid,permissive,manual,,,,,2019-04-05,Y,random,chi,10,120000,daytime recordings,have siblings,Gabin Fournier,5,Y,Y,,,,,0
old_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,0
metrics,vtc,permissive,derivation,,,,,2024-06-23,Y,,,,,,,,,,,,,,,0
vtc_present,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-07,Y,,,,,,,,,,,,,,,0
new_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,0
eaf_basic,eaf_basic,permissive,manual,,,,,2022-03-17,Y,periodic,,15,3000,all the recordings,all the participants,Valentino Mayer,2,Y,,,Y,N,,0
alice/output,vtc,permissive,automated,ALICE,"Räsänen, O., Seshadri, S., Lavechin, M., Cristia, A. & Casillas, M. (in press): ALICE: An open-source tool for automatic linguistic unit count estimation from child-centered daylong recordings. Behavior Research Methods. Online open acccess: https://link.springer.com/article/10.3758/s13428-020-01460-x.",1,https://github.com/LoannPeurey/ALICE/tree/cae98d47a1e16b19bc7452a3984e915839363373,2024-04-07,Y,,,,,,,,,,,Y,,,,0
eaf_solis,eaf_basic,permissive,manual,,,,,2023-10-02,Y,high-volubility,,20,60000,all the recordings,all the participants,Marie Littelamb,5,Y,Y,,,,Y,0
textgrid2,textgrid2,permissive,manual,,,,,2019-07-16,Y,high-volubility,fem,17,50000,all recordings,1 to 2 yo,Ivan Ciao,5,Y,N,,Y,,,0
set,segmentation,segmentation_type,method,annotation_algorithm_name,annotation_algorithm_publication,annotation_algorithm_version,annotation_algorithm_repo,date_annotation,has_speaker_type,sampling_method,sampling_target,sampling_count,sampling_unit_duration,recording_selection,participant_selection,annotator_name,annotator_experience,has_vcm_type,has_addressee,has_words,has_transcription,has_interactions,has_acoustics,invented,random_field,duration
vtc_rttm,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-14,Y,,,,,,,,,,,,,,,,,0
textgrid,textgrid,permissive,manual,,,,,2019-04-05,Y,random,chi,10,120000,daytime recordings,have siblings,Gabin Fournier,5,Y,Y,,,,,,34,0
old_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,,,0
metrics,vtc,permissive,derivation,,,,,2024-06-23,Y,,,,,,,,,,,,,,,,,0
vtc_present,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-07,Y,,,,,,,,,,,,,,,,,0
new_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,,,0
eaf_basic,eaf_basic,permissive,manual,,,,,2022-03-17,Y,periodic,,15,3000,all the recordings,all the participants,Valentino Mayer,2,Y,,,Y,N,,,,0
alice/output,vtc,permissive,automated,ALICE,"Räsänen, O., Seshadri, S., Lavechin, M., Cristia, A. & Casillas, M. (in press): ALICE: An open-source tool for automatic linguistic unit count estimation from child-centered daylong recordings. Behavior Research Methods. Online open acccess: https://link.springer.com/article/10.3758/s13428-020-01460-x.",1,https://github.com/LoannPeurey/ALICE/tree/cae98d47a1e16b19bc7452a3984e915839363373,2024-04-07,Y,,,,,,,,,,,Y,,,,made up field to trigger warning,,0
eaf_solis,eaf_basic,permissive,manual,,,,,2023-10-02,Y,high-volubility,,20,60000,all the recordings,all the participants,Marie Littelamb,5,Y,Y,,,,Y,,,0
textgrid2,textgrid2,permissive,manual,,,,,2019-07-16,Y,high-volubility,fem,17,50000,all recordings,1 to 2 yo,Ivan Ciao,5,Y,N,,Y,,,,,0

0 comments on commit e7cba61

Please sign in to comment.