Skip to content

Commit f3e9dfb

Browse files
authored
Add read_allele_frequency to python API (#533)
1 parent ab3cec7 commit f3e9dfb

File tree

2 files changed

+51
-0
lines changed

2 files changed

+51
-0
lines changed

apis/python/src/tiledbvcf/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,4 @@ def _load_libs():
3838
pass
3939

4040
from .version import version
41+
from .allele_frequency import read_allele_frequency
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import pandas
2+
3+
4+
def _calc_af(df) -> pandas.DataFrame:
5+
"""
6+
Consolidate AC and compute AN, AF
7+
8+
:param pandas.Dataframe df
9+
"""
10+
# Allele Count (AC) = sum of all AC at the same locus
11+
# This step consolidates ACs from all ingested batches
12+
df = df.groupby(["pos", "allele"], sort=True).sum(numeric_only=True)
13+
14+
# Allele Number (AN) = sum of AC at the same locus
15+
an = df.groupby(["pos"], sort=True).ac.sum().rename("an")
16+
df = df.join(an, how="inner").reset_index()
17+
18+
# Allele Frequency (AF) = AC / AN
19+
df["af"] = df.ac / df.an
20+
return df
21+
22+
23+
def read_allele_frequency(dataset_uri: str, region: str) -> pandas.DataFrame:
    """
    Read allele counts for a region and return allele frequencies.

    The ``variant_stats`` member of the TileDB group at ``dataset_uri`` is
    opened, its ``ac`` and ``allele`` attributes are read for the requested
    region, and the rows are passed to ``_calc_af``.

    :param dataset_uri: dataset URI
    :param region: genomic region to read, formatted as ``contig:start-end``
        with integer ``start`` and ``end``
    :return: the frame produced by ``_calc_af`` for the rows in the region
    :raises ValueError: if ``region`` is not formatted as ``contig:start-end``
    """
    import tiledb

    # Get the variant stats uri
    with tiledb.Group(dataset_uri) as g:
        alleles_uri = g["variant_stats"].uri

    try:
        # Split on the LAST ":" so contig names that themselves contain ":"
        # (for example HLA allele names) are kept intact.
        contig, separator, interval = region.rpartition(":")
        if not separator or not contig:
            raise ValueError("missing contig name or ':' separator")
        # Exactly two integer fields are required; anything else raises
        # ValueError from the unpacking or from int().
        start, end = map(int, interval.split("-"))
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError/TypeError cover a region that is not a str.
        raise ValueError(
            f"Invalid region: {region}. Expected format: contig:start-end"
        ) from e

    # NOTE(review): TileDB's .df[] indexer is believed to treat both slice
    # bounds as inclusive -- confirm against the TileDB-Py documentation.
    region_slice = slice(start, end)

    with tiledb.open(alleles_uri) as A:
        df = A.query(attrs=["ac", "allele"], dims=["pos", "contig"]).df[
            contig, region_slice
        ]

    return _calc_af(df)

0 commit comments

Comments
 (0)