diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py index 0940e8b6..b930263a 100644 --- a/tests/test_sgkit.py +++ b/tests/test_sgkit.py @@ -527,6 +527,7 @@ def test_ancestral_missingness(tmp_path): ancestral_allele = ds.variant_ancestral_allele.values ancestral_allele[0] = "N" ancestral_allele[11] = "-" + ancestral_allele[12] = "💩" ancestral_allele[15] = "💩" ds = ds.drop_vars(["variant_ancestral_allele"]) sgkit.save_dataset(ds, str(zarr_path) + ".tmp") @@ -538,19 +539,16 @@ def test_ancestral_missingness(tmp_path): ) ds = sgkit.load_dataset(str(zarr_path) + ".tmp") sd = tsinfer.SgkitSampleData(str(zarr_path) + ".tmp") - with pytest.warns(UserWarning, match="The following alleles were not found"): + with pytest.warns( + UserWarning, + match=r"not found in the variant_allele array for the 4 [\s\S]*'💩': 2", + ): inf_ts = tsinfer.infer(sd) - for i, ( - inf_var, - var, - ) in enumerate(zip(inf_ts.variants(), ts.variants())): - assert inf_var.site.ancestral_state == var.site.ancestral_state or i in [ - 0, - 11, - 15, - ] - if i in [0, 11, 15]: + for i, (inf_var, var) in enumerate(zip(inf_ts.variants(), ts.variants())): + if i in [0, 11, 12, 15]: assert inf_var.site.metadata == {"inference_type": "parsimony"} + else: + assert inf_var.site.ancestral_state == var.site.ancestral_state @pytest.mark.skipif(sys.platform == "win32", reason="File permission errors on Windows") diff --git a/tsinfer/formats.py b/tsinfer/formats.py index ee2125a9..5ae575b7 100644 --- a/tsinfer/formats.py +++ b/tsinfer/formats.py @@ -2405,11 +2405,20 @@ def sites_ancestral_allele(self): except IndexError: unknown_alleles[allele] += 1 ret[i] = allele_index - if sum(unknown_alleles.values()) > 0: + tot = sum(unknown_alleles.values()) + if tot > 0: + num_sites = len(string_allele) + frac_bad = tot / num_sites + frac_bad_per_type = [v / num_sites for v in unknown_alleles.values()] + summarise_unknown = [ + f"'{k}': {v} ({frac * 100:.2f}% of sites)" # Summarise per allele type + for (k, v), frac in zip(unknown_alleles.items(), frac_bad_per_type) + ] warnings.warn( - "The following alleles were not found in the variant_allele array " - "and will be treated as unknown:\n" - f"{unknown_alleles}" + "An ancestral allele was not found in the variant_allele array for " + + f"the {tot} sites ({frac_bad * 100 :.2f}%) listed below. " + + "They will be treated as of unknown ancestral state:\n " + + "\n ".join(summarise_unknown) ) return ret