Don't clobber existing sgkit_sample_id in individual metadata

benjeffery · mergify[bot] · commit 93e386e5b544 · 2023-11-15T15:06:32.000Z
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,15 @@
 
 In development
 
+**Features**
+
+- `tsinfer` now supports inferring data from an `sgkit` dataset. This allows users to
+  infer from VCFs via the optimised and parallel VCF parsing in `sgkit`.
+  - The `variant_mask` boolean array in the `sgkit` dataset can be used to mask sites
+    not wanted for inference.
+  - `sgkit` `sample_ids` are inserted into individual metadata as `sgkit_sample_id` if
+    this key does not already exist.
+
 **Breaking Changes**
 
 - Remove the `uuid` field from SampleData. SampleData equality is now purely based
diff --git a/tests/test_sgkit.py b/tests/test_sgkit.py
@@ -19,15 +19,18 @@
 """
 Tests for the data files.
 """
+import json
 import sys
 import tempfile
 
 import msprime
+import numcodecs
 import numpy as np
 import pytest
 import sgkit
 import tskit
 import xarray as xr
+import zarr
 
 import tsinfer
 from tsinfer import formats
@@ -262,8 +265,8 @@ def test_sgkit_dataset_roundtrip(tmp_path):
     ds = sgkit.load_dataset(zarr_path)
 
     assert ts.num_individuals == inf_ts.num_individuals == ds.dims["samples"]
-    for (i, ind) in zip(inf_ts.individuals(), ds["sample_id"].values):
-        assert i.metadata["sgkit_sample_id"] == ind
+    for ts_ind, sample_id in zip(inf_ts.individuals(), ds["sample_id"].values):
+        assert ts_ind.metadata["sgkit_sample_id"] == sample_id
 
     assert (
         ts.num_samples == inf_ts.num_samples == ds.dims["samples"] * ds.dims["ploidy"]
@@ -284,6 +287,35 @@ def test_sgkit_dataset_roundtrip(tmp_path):
     assert inf_ts.num_edges > 200
 
 
+@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows")
+def test_sgkit_individual_metadata_not_clobbered(tmp_path):
+    ts, zarr_path = make_ts_and_zarr(tmp_path)
+    # Load the zarr to add metadata for testing
+    zarr_root = zarr.open(zarr_path)
+    empty_obj = json.dumps({}).encode()
+    indiv_metadata = np.array([empty_obj] * ts.num_individuals, dtype=object)
+    indiv_metadata[42] = json.dumps({"sgkit_sample_id": "foobar"}).encode()
+    zarr_root.create_dataset(
+        "individuals_metadata", data=indiv_metadata, object_codec=numcodecs.VLenBytes()
+    )
+    zarr_root.attrs["individuals_metadata_schema"] = repr(
+        tskit.MetadataSchema.permissive_json()
+    )
+
+    samples = tsinfer.SgkitSampleData(zarr_path)
+    inf_ts = tsinfer.infer(samples)
+    ds = sgkit.load_dataset(zarr_path)
+
+    assert ts.num_individuals == inf_ts.num_individuals == ds.dims["samples"]
+    for i, (ts_ind, sample_id) in enumerate(
+        zip(inf_ts.individuals(), ds["sample_id"].values)
+    ):
+        if i != 42:
+            assert ts_ind.metadata["sgkit_sample_id"] == sample_id
+        else:
+            assert ts_ind.metadata["sgkit_sample_id"] == "foobar"
+
+
 @pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows")
 def test_sgkit_dataset_accessors(tmp_path):
     ts, zarr_path = make_ts_and_zarr(tmp_path, add_optional=True, shuffle_alleles=False)
diff --git a/tsinfer/formats.py b/tsinfer/formats.py
@@ -2512,6 +2512,8 @@ def individuals_metadata_schema(self):
     @functools.cached_property
     def individuals_metadata(self):
         schema = tskit.MetadataSchema(self.populations_metadata_schema)
+        # We set the sample_id in the individual metadata as this is often useful;
+        # however, an existing "sgkit_sample_id" key is silently left unchanged.
         if "individuals_metadata" in self.data:
             assert len(self.data["individuals_metadata"]) == self.num_individuals
             assert self.num_individuals == len(self.data["sample_id"])
@@ -2520,7 +2522,8 @@ def individuals_metadata(self):
                 self.data["sample_id"], self.data["individuals_metadata"][:]
             ):
                 md = schema.decode_row(r)
-                md["sgkit_sample_id"] = sample_id
+                if "sgkit_sample_id" not in md:
+                    md["sgkit_sample_id"] = sample_id
                 md_list.append(md)
             return md_list
         else: