-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add read_allele_frequency to python API (#533)
- Loading branch information
Showing
2 changed files
with
51 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,3 +38,4 @@ def _load_libs(): | |
pass | ||
|
||
from .version import version | ||
from .allele_frequency import read_allele_frequency |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
import pandas | ||
|
||
|
||
def _calc_af(df) -> pandas.DataFrame: | ||
""" | ||
Consolidate AC and compute AN, AF | ||
:param pandas.Dataframe df | ||
""" | ||
# Allele Count (AC) = sum of all AC at the same locus | ||
# This step consolidates ACs from all ingested batches | ||
df = df.groupby(["pos", "allele"], sort=True).sum(numeric_only=True) | ||
|
||
# Allele Number (AN) = sum of AC at the same locus | ||
an = df.groupby(["pos"], sort=True).ac.sum().rename("an") | ||
df = df.join(an, how="inner").reset_index() | ||
|
||
# Allele Frequency (AF) = AC / AN | ||
df["af"] = df.ac / df.an | ||
return df | ||
|
||
|
||
def read_allele_frequency(dataset_uri: str, region: str) -> pandas.DataFrame(): | ||
""" | ||
Read variant status | ||
:param dataset_uri: dataset URI | ||
:param region: genomics region to read | ||
""" | ||
import tiledb | ||
|
||
# Get the variant stats uri | ||
with tiledb.Group(dataset_uri) as g: | ||
alleles_uri = g["variant_stats"].uri | ||
|
||
try: | ||
contig = region.split(":")[0] | ||
start, end = map(int, region.split(":")[1].split("-")) | ||
region_slice = slice(start, end) | ||
except Exception as e: | ||
raise ValueError( | ||
f"Invalid region: {region}. Expected format: contig:start-end" | ||
) from e | ||
|
||
with tiledb.open(alleles_uri) as A: | ||
df = A.query(attrs=["ac", "allele"], dims=["pos", "contig"]).df[ | ||
contig, region_slice | ||
] | ||
|
||
return _calc_af(df) |