Skip to content

Commit

Permalink
Add read_allele_frequency to python API (#533)
Browse files Browse the repository at this point in the history
  • Loading branch information
gspowley authored Jun 8, 2023
1 parent ab3cec7 commit f3e9dfb
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
1 change: 1 addition & 0 deletions apis/python/src/tiledbvcf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ def _load_libs():
pass

from .version import version
from .allele_frequency import read_allele_frequency
50 changes: 50 additions & 0 deletions apis/python/src/tiledbvcf/allele_frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import pandas


def _calc_af(df) -> pandas.DataFrame:
"""
Consolidate AC and compute AN, AF
:param pandas.Dataframe df
"""
# Allele Count (AC) = sum of all AC at the same locus
# This step consolidates ACs from all ingested batches
df = df.groupby(["pos", "allele"], sort=True).sum(numeric_only=True)

# Allele Number (AN) = sum of AC at the same locus
an = df.groupby(["pos"], sort=True).ac.sum().rename("an")
df = df.join(an, how="inner").reset_index()

# Allele Frequency (AF) = AC / AN
df["af"] = df.ac / df.an
return df


def read_allele_frequency(dataset_uri: str, region: str) -> pandas.DataFrame():
"""
Read variant status
:param dataset_uri: dataset URI
:param region: genomics region to read
"""
import tiledb

# Get the variant stats uri
with tiledb.Group(dataset_uri) as g:
alleles_uri = g["variant_stats"].uri

try:
contig = region.split(":")[0]
start, end = map(int, region.split(":")[1].split("-"))
region_slice = slice(start, end)
except Exception as e:
raise ValueError(
f"Invalid region: {region}. Expected format: contig:start-end"
) from e

with tiledb.open(alleles_uri) as A:
df = A.query(attrs=["ac", "allele"], dims=["pos", "contig"]).df[
contig, region_slice
]

return _calc_af(df)

0 comments on commit f3e9dfb

Please sign in to comment.