Skip to content

Commit

Permalink
Rename sgkit_sample_id
Browse files Browse the repository at this point in the history
  • Loading branch information
benjeffery authored and mergify[bot] committed Jul 26, 2024
1 parent a803933 commit 738bd13
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 10 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
infer from VCFs via the optimised and parallel VCF parsing in `sgkit`.
- The `variant_mask` boolean array in the `sgkit` dataset can be used to mask sites
not wanted for inference.
- `sgkit` `sample_ids` are inserted into individual metadata as `sgkit_sample_id` if
- `sample_ids` are inserted into individual metadata as `variant_data_sample_id` if
this key does not already exist.

**Breaking Changes**
Expand Down
12 changes: 6 additions & 6 deletions tests/test_variantdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def test_sgkit_dataset_roundtrip(tmp_path):

assert ts.num_individuals == inf_ts.num_individuals == ds.dims["samples"]
for ts_ind, sample_id in zip(inf_ts.individuals(), ds["sample_id"].values):
assert ts_ind.metadata["sgkit_sample_id"] == sample_id
assert ts_ind.metadata["variant_data_sample_id"] == sample_id

assert (
ts.num_samples == inf_ts.num_samples == ds.dims["samples"] * ds.dims["ploidy"]
Expand Down Expand Up @@ -112,7 +112,7 @@ def test_sgkit_individual_metadata_not_clobbered(tmp_path):
zarr_root = zarr.open(zarr_path)
empty_obj = json.dumps({}).encode()
indiv_metadata = np.array([empty_obj] * ts.num_individuals, dtype=object)
indiv_metadata[42] = json.dumps({"sgkit_sample_id": "foobar"}).encode()
indiv_metadata[42] = json.dumps({"variant_data_sample_id": "foobar"}).encode()
zarr_root.create_dataset(
"individuals_metadata", data=indiv_metadata, object_codec=numcodecs.VLenBytes()
)
Expand All @@ -129,9 +129,9 @@ def test_sgkit_individual_metadata_not_clobbered(tmp_path):
zip(inf_ts.individuals(), ds["sample_id"].values)
):
if i != 42:
assert ts_ind.metadata["sgkit_sample_id"] == sample_id
assert ts_ind.metadata["variant_data_sample_id"] == sample_id
else:
assert ts_ind.metadata["sgkit_sample_id"] == "foobar"
assert ts_ind.metadata["variant_data_sample_id"] == "foobar"


@pytest.mark.skipif(sys.platform == "win32", reason="No cyvcf2 on windows")
Expand Down Expand Up @@ -186,7 +186,7 @@ def test_sgkit_dataset_accessors(tmp_path):
== ts.tables.individuals.metadata_schema.schema
)
assert samples.individuals_metadata == [
{"sgkit_sample_id": sample_id, **ind.metadata}
{"variant_data_sample_id": sample_id, **ind.metadata}
for ind, sample_id in zip(ts.individuals(), ds["sample_id"].values)
]
assert np.array_equal(
Expand Down Expand Up @@ -234,7 +234,7 @@ def test_sgkit_accessors_defaults(tmp_path):
assert samples.populations_metadata == []
assert samples.individuals_metadata_schema == default_schema
assert samples.individuals_metadata == [
{"sgkit_sample_id": sample_id} for sample_id in ds["sample_id"].values
{"variant_data_sample_id": sample_id} for sample_id in ds["sample_id"].values
]
for time in samples.individuals_time:
assert tskit.is_unknown_time(time)
Expand Down
6 changes: 3 additions & 3 deletions tsinfer/formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -2621,13 +2621,13 @@ def individuals_metadata(self):
self.data["individuals_metadata"][:][self.individuals_select],
):
md = schema.decode_row(r)
if "sgkit_sample_id" not in md:
md["sgkit_sample_id"] = sample_id
if "variant_data_sample_id" not in md:
md["variant_data_sample_id"] = sample_id
md_list.append(md)
return md_list
else:
return [
{"sgkit_sample_id": sample_id}
{"variant_data_sample_id": sample_id}
for sample_id in self.data["sample_id"][:][self.individuals_select]
]

Expand Down

0 comments on commit 738bd13

Please sign in to comment.