diff --git a/examples/valid_raw_data/annotations/alice/output/metannots.yml b/examples/valid_raw_data/annotations/alice/output/metannots.yml index b2d91aa93..7b9f69a32 100644 --- a/examples/valid_raw_data/annotations/alice/output/metannots.yml +++ b/examples/valid_raw_data/annotations/alice/output/metannots.yml @@ -8,3 +8,4 @@ annotation_algorithm_repo: 'https://github.com/LoannPeurey/ALICE/tree/cae98d47a1 date_annotation: '2024-04-07' has_speaker_type: 'Y' has_words: 'Y' +invented: "made up field to trigger warning" diff --git a/examples/valid_raw_data/annotations/old_its/metannots.yml b/examples/valid_raw_data/annotations/old_its/metannots.yml deleted file mode 100644 index f3d757aab..000000000 --- a/examples/valid_raw_data/annotations/old_its/metannots.yml +++ /dev/null @@ -1,9 +0,0 @@ -segmentation: 'its' -segmentation_type: 'restrictive' -method: 'automated' -annotation_algorithm_name: 'ITS' -annotation_algorithm_publication: 'Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx' -annotation_algorithm_version: '1' -date_annotation: '2024-05-07' -has_speaker_type: 'Y' -has_words: 'Y' diff --git a/examples/valid_raw_data/annotations/old_its/metannots.yml b/examples/valid_raw_data/annotations/old_its/metannots.yml new file mode 120000 index 000000000..7080072d7 --- /dev/null +++ b/examples/valid_raw_data/annotations/old_its/metannots.yml @@ -0,0 +1 @@ +doesntexist \ No newline at end of file diff --git a/examples/valid_raw_data/annotations/textgrid/metannots.yml b/examples/valid_raw_data/annotations/textgrid/metannots.yml index fbdbe7852..70a0378e5 100644 --- a/examples/valid_raw_data/annotations/textgrid/metannots.yml +++ b/examples/valid_raw_data/annotations/textgrid/metannots.yml @@ -13,3 +13,4 @@ date_annotation: '2019-04-05' has_speaker_type: 'Y' has_vcm_type: 'Y' has_addressee: 'Y' +random_field: 34 \ No newline at end of file diff --git a/examples/valid_raw_data/annotations/vtc_rttm/metannots.yml b/examples/valid_raw_data/annotations/vtc_rttm/metannots.yml deleted file mode 100644 index 9d9de7fed..000000000 --- a/examples/valid_raw_data/annotations/vtc_rttm/metannots.yml +++ /dev/null @@ -1,9 +0,0 @@ -segmentation: 'vtc' -segmentation_type: 'permissive' -method: 'automated' -annotation_algorithm_name: 'VTC' -annotation_algorithm_publication: 'Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf' -annotation_algorithm_version: '1' -annotation_algorithm_repo: 'https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2' -date_annotation: '2024-04-14' -has_speaker_type: 'Y' diff --git a/tests/test_annotations.py b/tests/test_annotations.py index e3b7bf2ca..82733bf93 100644 --- a/tests/test_annotations.py +++ b/tests/test_annotations.py @@ -34,7 +34,7 @@ def project(request): if os.path.exists(PATH): # shutil.copytree(src="examples/valid_raw_data", dst="output/annotations") shutil.rmtree(PATH) - shutil.copytree(src="examples/valid_raw_data", dst=PATH) + shutil.copytree(src="examples/valid_raw_data", dst=PATH, symlinks=True) project = ChildProject(PATH) @@ -196,7 +196,9 @@ def test_import(project, am): assert len(errors) == 0 and len(warnings) == 0, "malformed annotations detected" errors, warnings = am.read() - assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected" + assert (len(errors) == 0 and + warnings == ["Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations//metannots.yml", "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`", "Metadata files for sets contain the following unknown fields ['invented', 'random_field'] which can be found in the metadata for sets ['alice/output', 'textgrid']"] + ), "malformed annotation indexes detected" for dataset in ["eaf_basic", "textgrid", "eaf_solis"]: annotations = am.annotations[am.annotations["set"] == dataset] @@ -290,7 +292,9 @@ def test_multiple_imports(project, am, input_file, ow, rimported, rerrors, excep errors, warnings = am.read() print(errors) print(warnings) - assert len(errors) == 0 and len(warnings) == 0, "malformed annotation indexes detected" + assert (len(errors) == 0 and + warnings == ["Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations//metannots.yml", "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`", "Metadata files for sets contain the following unknown fields ['invented', 'random_field'] which can be found in the metadata for sets ['alice/output', 'textgrid']"] + ), "malformed annotation indexes detected" def test_import_incorrect_data_types(project, am): @@ -765,35 +769,47 @@ def test_set_from_path(project, am): == "set/subset" ) -sets_metadata_default = pd.DataFrame() -@pytest.mark.parametrize("metadata_exists,warning,return_value,error", - [(True, 'not set', None, ValueError), - (True, 'ignore', sets_metadata_default, None), - (True, 'return', (sets_metadata_default, []), None), - (True, 'log', sets_metadata_default, None), - (False, 'not set', None, ValueError), - (False, 'ignore', sets_metadata_default, None), - (False, 'return', (sets_metadata_default, []), None), - (False, 'log', sets_metadata_default, None), +# TODO : Add testing for all the kinds of warnings? +@pytest.mark.parametrize("metadata_exists,warning,truth_path,warnings,log", + [(True, 'ignore', TRUTH / 'sets_metadata.csv', None, []), + (True, 'return', TRUTH / 'sets_metadata.csv', ["Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations//metannots.yml", "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`", "Metadata files for sets contain the following unknown fields ['random_field', 'invented'] which can be found in the metadata for sets ['textgrid', 'alice/output']"], []), + (True, 'log', TRUTH / 'sets_metadata.csv', None, [('ChildProject.annotations', 30, "Metadata files for sets ['vtc_rttm'] could not be found, they should be created as annotations//metannots.yml"), ('ChildProject.annotations', 30, "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`"), ('ChildProject.annotations', 30, "Metadata files for sets contain the following unknown fields ['random_field', 'invented'] which can be found in the metadata for sets ['textgrid', 'alice/output']")]), + (False, 'ignore', TRUTH / 'sets_empty_metadata.csv', None, []), + (False, 'return', TRUTH / 'sets_empty_metadata.csv', ["Metadata files for sets ['vtc_rttm', 'textgrid', 'metrics', 'vtc_present', 'new_its', 'eaf_basic', 'alice/output', 'eaf_solis', 'textgrid2'] could not be found, they should be created as annotations//metannots.yml", "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`"], []), + (False, 'log', TRUTH / 'sets_empty_metadata.csv', None, [('ChildProject.annotations', 30, "Metadata files for sets ['vtc_rttm', 'textgrid', 'metrics', 'vtc_present', 'new_its', 'eaf_basic', 'alice/output', 'eaf_solis', 'textgrid2'] could not be found, they should be created as annotations//metannots.yml"), ('ChildProject.annotations', 30, "Metadata file content for sets ['old_its'] could not be found, it may be downloaded from a remote with the command `datalad get annotations/**/metannots.yml`")]), ]) -def test_read_sets_metadata(project, am, metadata_exists, warning, return_value, error): +def test_read_sets_metadata(project, am, caplog, metadata_exists, warning, truth_path, warnings, log): + # rather than importing the annotation sets (which relies on having the importation work correctly + # just create a fake annotation record that can be used to load metadata + sets = [n if n != 'alice' else 'alice/output' for n in os.listdir(project.path / ANNOTATIONS) if + os.path.isdir(project.path / ANNOTATIONS / n)] + zeros = [0 for i in range(len(sets))] + fields = ['' for i in range(len(sets))] + + am.annotations = pd.DataFrame({'set': sets, 'range_onset': zeros, 'range_offset': zeros, + 'annotation_filename': fields, 'raw_filename': fields}) + if not metadata_exists: for set in am.annotations['set'].unique(): - if os.path.exists(project.path / set / METANNOTS): - os.remove(project.path / set / METANNOTS) - if error is not None: - with pytest.raises(error): - am._read_sets_metadata(warning) - else: - result = am._read_sets_metadata(warning) - if type(return_value) == tuple: - print(result[0]) - assert return_value[1] == result[1] - pd.testing.assert_frame_equal(return_value[0], result[0]) - else: - print(result) - pd.testing.assert_frame_equal(return_value, result) + if os.path.exists(project.path / ANNOTATIONS / set / METANNOTS): + os.remove(project.path / ANNOTATIONS / set / METANNOTS) + dtypes = {f.name: f.dtype if f.dtype is not None else 'string' for f in AnnotationManager.SETS_COLUMNS} + truth_df = pd.read_csv(truth_path, index_col='set', dtype=dtypes).drop(columns='duration') + return_value = (truth_df, warnings) if warnings is not None else truth_df + + result = am._read_sets_metadata(warning) + + capt_log = caplog.record_tuples + # assert capt_stdout == stdout + assert capt_log == log + + if type(return_value) == tuple: + assert result[1] == return_value[1] + pd.testing.assert_frame_equal(return_value[0], result[0], check_like=True, check_dtype=False) + else: + print(result) + pd.testing.assert_frame_equal(return_value, result, check_like=True, check_dtype=False) @pytest.mark.parametrize("metadata_exists,return_value", [(True, TRUTH / 'sets_metadata.csv'), @@ -818,7 +834,7 @@ def test_get_sets_metadata(project, am, metadata_exists, return_value): result = am.get_sets_metadata() # result.to_csv(return_value, index_label='set') dtypes = {f.name: f.dtype if f.dtype is not None else 'string' for f in AnnotationManager.SETS_COLUMNS} - pd.testing.assert_frame_equal(pd.read_csv(return_value, index_col='set', dtype=dtypes), result, check_like=True) + pd.testing.assert_frame_equal(pd.read_csv(return_value, index_col='set', dtype=dtypes), result, check_like=True, check_dtype=False) # its diff --git a/tests/truth/sets_metadata.csv b/tests/truth/sets_metadata.csv index d7bc7be40..8d8e2fc7e 100644 --- a/tests/truth/sets_metadata.csv +++ b/tests/truth/sets_metadata.csv @@ -1,11 +1,11 @@ -set,segmentation,segmentation_type,method,annotation_algorithm_name,annotation_algorithm_publication,annotation_algorithm_version,annotation_algorithm_repo,date_annotation,has_speaker_type,sampling_method,sampling_target,sampling_count,sampling_unit_duration,recording_selection,participant_selection,annotator_name,annotator_experience,has_vcm_type,has_addressee,has_words,has_transcription,has_interactions,has_acoustics,duration -vtc_rttm,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-14,Y,,,,,,,,,,,,,,,0 -textgrid,textgrid,permissive,manual,,,,,2019-04-05,Y,random,chi,10,120000,daytime recordings,have siblings,Gabin Fournier,5,Y,Y,,,,,0 -old_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,0 -metrics,vtc,permissive,derivation,,,,,2024-06-23,Y,,,,,,,,,,,,,,,0 -vtc_present,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-07,Y,,,,,,,,,,,,,,,0 -new_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,0 -eaf_basic,eaf_basic,permissive,manual,,,,,2022-03-17,Y,periodic,,15,3000,all the recordings,all the participants,Valentino Mayer,2,Y,,,Y,N,,0 -alice/output,vtc,permissive,automated,ALICE,"Räsänen, O., Seshadri, S., Lavechin, M., Cristia, A. & Casillas, M. (in press): ALICE: An open-source tool for automatic linguistic unit count estimation from child-centered daylong recordings. Behavior Research Methods. Online open acccess: https://link.springer.com/article/10.3758/s13428-020-01460-x.",1,https://github.com/LoannPeurey/ALICE/tree/cae98d47a1e16b19bc7452a3984e915839363373,2024-04-07,Y,,,,,,,,,,,Y,,,,0 -eaf_solis,eaf_basic,permissive,manual,,,,,2023-10-02,Y,high-volubility,,20,60000,all the recordings,all the participants,Marie Littelamb,5,Y,Y,,,,Y,0 -textgrid2,textgrid2,permissive,manual,,,,,2019-07-16,Y,high-volubility,fem,17,50000,all recordings,1 to 2 yo,Ivan Ciao,5,Y,N,,Y,,,0 +set,segmentation,segmentation_type,method,annotation_algorithm_name,annotation_algorithm_publication,annotation_algorithm_version,annotation_algorithm_repo,date_annotation,has_speaker_type,sampling_method,sampling_target,sampling_count,sampling_unit_duration,recording_selection,participant_selection,annotator_name,annotator_experience,has_vcm_type,has_addressee,has_words,has_transcription,has_interactions,has_acoustics,invented,random_field,duration +vtc_rttm,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-14,Y,,,,,,,,,,,,,,,,,0 +textgrid,textgrid,permissive,manual,,,,,2019-04-05,Y,random,chi,10,120000,daytime recordings,have siblings,Gabin Fournier,5,Y,Y,,,,,,34,0 +old_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,,,0 +metrics,vtc,permissive,derivation,,,,,2024-06-23,Y,,,,,,,,,,,,,,,,,0 +vtc_present,vtc,permissive,automated,VTC,"Lavechin, M., Bousbib, R., Bredin, H., Dupoux, E., & Cristia, A. (2020). An open-source voice type classifier for child-centered daylong recordings. Interspeech. Online open access: https://www.isca-archive.org/interspeech_2020/lavechin20_interspeech.pdf",1,https://github.com/MarvinLvn/voice-type-classifier/tree/e443d8cfc40f7076eea903958d9344d4aa427cc2,2024-04-07,Y,,,,,,,,,,,,,,,,,0 +new_its,its,restrictive,automated,ITS,"Xu, D., Yapanel, U., & Gray, S. ( 2009). Reliability of the LENA Language Environment Analysis System in young children’s natural home environment. Boulder, CO: LENA Foundation. Retrieved March 26, 2009, from http://www.lenafoundation.org/Research/TechnicalReports.aspx",1,,2024-05-07,Y,,,,,,,,,,,Y,,,,,,0 +eaf_basic,eaf_basic,permissive,manual,,,,,2022-03-17,Y,periodic,,15,3000,all the recordings,all the participants,Valentino Mayer,2,Y,,,Y,N,,,,0 +alice/output,vtc,permissive,automated,ALICE,"Räsänen, O., Seshadri, S., Lavechin, M., Cristia, A. & Casillas, M. (in press): ALICE: An open-source tool for automatic linguistic unit count estimation from child-centered daylong recordings. Behavior Research Methods. Online open acccess: https://link.springer.com/article/10.3758/s13428-020-01460-x.",1,https://github.com/LoannPeurey/ALICE/tree/cae98d47a1e16b19bc7452a3984e915839363373,2024-04-07,Y,,,,,,,,,,,Y,,,,made up field to trigger warning,,0 +eaf_solis,eaf_basic,permissive,manual,,,,,2023-10-02,Y,high-volubility,,20,60000,all the recordings,all the participants,Marie Littelamb,5,Y,Y,,,,Y,,,0 +textgrid2,textgrid2,permissive,manual,,,,,2019-07-16,Y,high-volubility,fem,17,50000,all recordings,1 to 2 yo,Ivan Ciao,5,Y,N,,Y,,,,,0