|
| 1 | +import pandas |
| 2 | + |
| 3 | + |
| 4 | +def _calc_af(df) -> pandas.DataFrame: |
| 5 | + """ |
| 6 | + Consolidate AC and compute AN, AF |
| 7 | +
|
| 8 | + :param pandas.Dataframe df |
| 9 | + """ |
| 10 | + # Allele Count (AC) = sum of all AC at the same locus |
| 11 | + # This step consolidates ACs from all ingested batches |
| 12 | + df = df.groupby(["pos", "allele"], sort=True).sum(numeric_only=True) |
| 13 | + |
| 14 | + # Allele Number (AN) = sum of AC at the same locus |
| 15 | + an = df.groupby(["pos"], sort=True).ac.sum().rename("an") |
| 16 | + df = df.join(an, how="inner").reset_index() |
| 17 | + |
| 18 | + # Allele Frequency (AF) = AC / AN |
| 19 | + df["af"] = df.ac / df.an |
| 20 | + return df |
| 21 | + |
| 22 | + |
def read_allele_frequency(dataset_uri: str, region: str) -> pandas.DataFrame:
    """
    Read allele counts for a region and return them with AN and AF computed.

    The counts are read from the array registered as ``variant_stats`` in the
    dataset's TileDB group and then passed through ``_calc_af``.

    :param dataset_uri: dataset URI
    :param region: genomic region to read, formatted as ``contig:start-end``
    :return: the result of ``_calc_af`` applied to the rows read for ``region``
    :raises ValueError: if ``region`` is not in ``contig:start-end`` format
    """
    # Validate the region first so a malformed value fails fast, before
    # tiledb is imported or any group/array is opened.
    try:
        contig = region.split(":")[0]
        start, end = map(int, region.split(":")[1].split("-"))
        region_slice = slice(start, end)
    except (AttributeError, IndexError, TypeError, ValueError) as e:
        raise ValueError(
            f"Invalid region: {region}. Expected format: contig:start-end"
        ) from e

    # Local import: tiledb is only required once this function is called.
    import tiledb

    # Get the variant stats uri
    with tiledb.Group(dataset_uri) as g:
        alleles_uri = g["variant_stats"].uri

    # Select the contig and the position range of the requested region.
    # NOTE(review): start/end are passed through unchanged; confirm the
    # indexer's range bounds match the intended region convention.
    with tiledb.open(alleles_uri) as A:
        df = A.query(attrs=["ac", "allele"], dims=["pos", "contig"]).df[
            contig, region_slice
        ]

    return _calc_af(df)
0 commit comments