Skip to content

Commit 1c355ae

Browse files
benjeffery authored and mergify[bot] committed
Fix wrong schema used for individual metadata
1 parent 1b3d035 commit 1c355ae

File tree

3 files changed

+35
-17
lines changed

3 files changed

+35
-17
lines changed

tests/test_variantdata.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ def test_sgkit_dataset_accessors(tmp_path):
170170
assert np.array_equal(
171171
samples.samples_individual, np.repeat(np.arange(ts.num_samples // 3), 3)
172172
)
173-
assert samples.metadata_schema == tsutil.EXAMPLE_SCHEMA.schema
173+
assert samples.metadata_schema == tsutil.example_schema("example").schema
174174
assert samples.metadata == ts.tables.metadata
175175
assert (
176176
samples.populations_metadata_schema

tests/tsutil.py

+32-14
Original file line number | Diff line number | Diff line change
@@ -198,9 +198,13 @@ def get_example_historical_sampled_ts(
198198
return tables.tree_sequence()
199199

200200

201-
EXAMPLE_SCHEMA = tskit.MetadataSchema(
202-
{"codec": "json", "properties": {"foo": {"type": "integer"}}}
203-
)
201+
def example_schema(default):
202+
return tskit.MetadataSchema(
203+
{
204+
"codec": "json",
205+
"properties": {"default_prop": {"type": "string", "default": default}},
206+
}
207+
)
204208

205209

206210
def add_array_to_dataset(name, array, zarr_path, dims=None):
@@ -227,22 +231,23 @@ def make_ts_and_zarr(path, add_optional=False, shuffle_alleles=True):
227231
)
228232
ts = msprime.sim_mutations(ts, rate=0.025, model=msprime.JC69(), random_seed=42)
229233
tables = ts.dump_tables()
230-
tables.metadata_schema = EXAMPLE_SCHEMA
234+
tables.metadata_schema = example_schema("example")
235+
tables.metadata = {"foo": "bar"}
231236
sites_copy = tables.sites.copy()
232237
tables.sites.clear()
233-
tables.sites.metadata_schema = EXAMPLE_SCHEMA
238+
tables.sites.metadata_schema = example_schema("sites")
234239
for i, site in enumerate(sites_copy):
235240
tables.sites.append(site.replace(metadata={"id_site": i}))
236241

237242
pops_copy = tables.populations.copy()
238243
tables.populations.clear()
239-
tables.populations.metadata_schema = EXAMPLE_SCHEMA
244+
tables.populations.metadata_schema = example_schema("populations")
240245
for i, pop in enumerate(pops_copy):
241246
tables.populations.append(pop.replace(metadata={"id_pop": i}))
242247

243248
indiv_copy = tables.individuals.copy()
244249
tables.individuals.clear()
245-
tables.individuals.metadata_schema = EXAMPLE_SCHEMA
250+
tables.individuals.metadata_schema = example_schema("individuals")
246251
for i, ind in enumerate(indiv_copy):
247252
tables.individuals.append(ind.replace(metadata={"id_indiv": i}))
248253

@@ -320,12 +325,14 @@ def make_ts_and_zarr(path, add_optional=False, shuffle_alleles=True):
320325
ts.sequence_length + 1337,
321326
path / "data.zarr",
322327
)
328+
sites_md = tables.sites.metadata
329+
sites_md_offset = tables.sites.metadata_offset
323330
add_array_to_dataset(
324331
"sites_metadata",
325332
np.array(
326333
[
327-
tables.sites.metadata_schema.encode_row(site.metadata)
328-
for site in ts.sites()
334+
sites_md[sites_md_offset[i] : sites_md_offset[i + 1]].tobytes()
335+
for i in range(ts.num_sites)
329336
]
330337
),
331338
path / "data.zarr",
@@ -347,6 +354,11 @@ def make_ts_and_zarr(path, add_optional=False, shuffle_alleles=True):
347354
repr(tables.metadata_schema),
348355
path / "data.zarr",
349356
)
357+
add_attribute_to_dataset(
358+
"metadata",
359+
tables.metadata_bytes.decode(),
360+
path / "data.zarr",
361+
)
350362
add_array_to_dataset(
351363
"provenances_timestamp",
352364
["2021-01-01T00:00:00", "2021-01-02T00:00:00"],
@@ -364,12 +376,16 @@ def make_ts_and_zarr(path, add_optional=False, shuffle_alleles=True):
364376
repr(tables.populations.metadata_schema),
365377
path / "data.zarr",
366378
)
379+
populations_md = tables.populations.metadata
380+
populations_md_offset = tables.populations.metadata_offset
367381
add_array_to_dataset(
368382
"populations_metadata",
369383
np.array(
370384
[
371-
tables.populations.metadata_schema.encode_row(population.metadata)
372-
for population in ts.populations()
385+
populations_md[
386+
populations_md_offset[i] : populations_md_offset[i + 1]
387+
].tobytes()
388+
for i in range(ts.num_populations)
373389
]
374390
),
375391
path / "data.zarr",
@@ -381,13 +397,15 @@ def make_ts_and_zarr(path, add_optional=False, shuffle_alleles=True):
381397
path / "data.zarr",
382398
["samples"],
383399
)
400+
indiv_md = tables.individuals.metadata
401+
indiv_md_offset = tables.individuals.metadata_offset
384402
add_array_to_dataset(
385403
"individuals_metadata",
386404
np.array(
387405
[
388-
tables.individuals.metadata_schema.encode_row(individual.metadata)
389-
for individual in ts.individuals()
390-
]
406+
indiv_md[indiv_md_offset[i] : indiv_md_offset[i + 1]].tobytes()
407+
for i in range(ts.num_individuals)
408+
],
391409
),
392410
path / "data.zarr",
393411
["samples"],

tsinfer/formats.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -2555,7 +2555,7 @@ def metadata_schema(self):
25552555
def metadata(self):
25562556
try:
25572557
return tskit.MetadataSchema(self.metadata_schema).decode_row(
2558-
self.data.attrs["metadata"]
2558+
self.data.attrs["metadata"].encode()
25592559
)
25602560
except KeyError:
25612561
return {}
@@ -2606,7 +2606,7 @@ def individuals_metadata_schema(self):
26062606

26072607
@functools.cached_property
26082608
def individuals_metadata(self):
2609-
schema = tskit.MetadataSchema(self.populations_metadata_schema)
2609+
schema = tskit.MetadataSchema(self.individuals_metadata_schema)
26102610
# We set the sample_id in the individual metadata as this is often useful,
26112611
# however we silently don't overwrite if the key exists
26122612
if "individuals_metadata" in self.data:

0 commit comments

Comments
 (0)