From af170e1abc15766b3f81da57c7b9b8895bf86575 Mon Sep 17 00:00:00 2001 From: Yan Wong Date: Fri, 19 Jan 2024 22:57:53 +0000 Subject: [PATCH] Better formatted error message for ancestral alleles --- tests/test_sgkit.py | 5 ++++- tsinfer/formats.py | 13 +++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py index 0940e8b6..b829057a 100644 --- a/tests/test_sgkit.py +++ b/tests/test_sgkit.py @@ -538,7 +538,10 @@ def test_ancestral_missingness(tmp_path): ) ds = sgkit.load_dataset(str(zarr_path) + ".tmp") sd = tsinfer.SgkitSampleData(str(zarr_path) + ".tmp") - with pytest.warns(UserWarning, match="The following alleles were not found"): + with pytest.warns( + UserWarning, + match="Ancestral alleles not found in the variant_allele array for 3 sites", + ): inf_ts = tsinfer.infer(sd) for i, ( inf_var, diff --git a/tsinfer/formats.py b/tsinfer/formats.py index ee2125a9..53614338 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -2405,11 +2405,16 @@ def sites_ancestral_allele(self): except IndexError: unknown_alleles[allele] += 1 ret[i] = allele_index - if sum(unknown_alleles.values()) > 0: + tot = sum(unknown_alleles.values()) + if tot > 0: warnings.warn( - "The following alleles were not found in the variant_allele array " - "and will be treated as unknown:\n" - f"{unknown_alleles}" + "Ancestral alleles not found in the variant_allele " + f"array for the {tot} sites ({tot/len(string_allele)*100:.2f}%) " + "listed below. They will be treated as of unknown ancestral state:\n " + + "\n ".join( + f"'{k}': {v} ({v/len(string_allele)*100:.2f}% of sites)" + for k, v in unknown_alleles.items() + ) ) return ret