forked from qlu-lab/POP-TOOLS
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcompute.py
129 lines (101 loc) · 5.5 KB
/
compute.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import polars as pl
from scipy.special import chdtrc
from utils import const_dict
import numpy as np
def _z_to_beta(z, n):
return z / n.sqrt() # return beta
def _beta_se_to_z_p(beta, se):
try:
z = beta / se
except (FloatingPointError, ZeroDivisionError):
z = float('inf')
p = chdtrc(1, z ** 2)
return z, p
def _linear_to_log_or(beta, case_prop, eaf):
return beta / (case_prop * (1 - case_prop))
def _effect_std_to_allele(effect_std, factor):
# return effect_allele
return effect_std / factor
def _case_prop(n_case, n):
return n_case / n
def _cal_qt_wtd(z_df, n, N1, N2, r12, r13, r23, eaf):
# r:
## r12: correlation between y_lab and yhat_lab
## r13: correlation between y_lab and yhat_unlab
## r23: correlation between yhat_lab and yhat_unlab
# n:
## n: number of samples in y_lab
## N1: number of samples in yhat_lab
## N2: number of samples in yhat_unlab
rho12 = r12 * (1/N1).sqrt() * (1/n).sqrt()
rho13 = r13 * (1/N2).sqrt() * (1/n).sqrt()
rho23 = r23 * (1/N1).sqrt() * (1/N2).sqrt()
omega = (rho12 - rho13) / (1/N1 + 1/N2 - 2*rho23)
# 1 Transform the Z-score into effect size
beta_yhat_unlab = _z_to_beta(pl.col(const_dict['Z_YHAT_UNLAB']), N2)
beta_yhat_lab = _z_to_beta(pl.col(const_dict['Z_YHAT_LAB']), N1)
beta_y_lab = _z_to_beta(pl.col(const_dict['Z_Y_LAB']), n)
# 2 Calculate the corrected effect size and its correspond standard error
beta_popgwas = beta_y_lab + omega * (beta_yhat_unlab - beta_yhat_lab)
se_popgwas = (1/n - (rho12 - rho13) / (1/N1 + 1/N2 - 2 * rho23) * (rho12 - rho13)).sqrt()
z_popgwas, p_popgwas = _beta_se_to_z_p(beta_popgwas, se_popgwas)
n_eff = 1 / se_popgwas**2
# 3 Transform the standardized scale into the allele scale
if eaf is not None:
factor_eaf = (2*eaf*(1-eaf)).sqrt()
beta_popgwas = _effect_std_to_allele(beta_popgwas, factor_eaf)
se_popgwas = _effect_std_to_allele(se_popgwas, factor_eaf)
return z_df.with_columns([beta_popgwas.alias(const_dict['BETA']), se_popgwas.alias(const_dict['SE']), z_popgwas.alias(const_dict['Z']), p_popgwas.alias(const_dict['P']), n_eff.alias(const_dict['N_EFF'])]).drop([const_dict['Z_YHAT_LAB'], const_dict['Z_YHAT_UNLAB'], const_dict['Z_Y_LAB']])
# d is data calculated by qt
def _qt_to_bt(d, n, n_case, eaf):
case_prop = _case_prop(n_case, n)
or_popgwas = _linear_to_log_or(pl.col(const_dict['BETA']), case_prop, eaf)
se_popgwas = _linear_to_log_or(pl.col(const_dict['SE']), case_prop, eaf)
n_eff_case = case_prop * pl.col(const_dict['N_EFF'])
n_eff_control = (1 - case_prop) * pl.col(const_dict['N_EFF'])
return d.with_columns([or_popgwas.alias(const_dict['BETA']), se_popgwas.alias(const_dict['SE']), n_eff_case.alias(const_dict['N_EFF_CASE']), n_eff_control.alias(const_dict['N_EFF_CONTROL'])]).rename({const_dict['BETA']: const_dict['OR']})
def estimate_popgwas(z_df, n_col, N1_col, n_case_col, N2_col, eaf_col, bt, r12, r13, r23):
n, N1, n_case, N2, eaf = pl.col(n_col), pl.col(N1_col), pl.col(n_case_col) if bt else None, pl.col(N2_col), pl.col(eaf_col)
r12 = pl.lit(r12)
r13 = pl.lit(r13)
r23 = pl.lit(r23)
d = _cal_qt_wtd(z_df=z_df.lazy(), n=n, N1=N1, N2=N2, r12=r12, r13 = r13, r23 = r23, eaf=eaf)
return _qt_to_bt(d=d, n=n, n_case=n_case, eaf=eaf).drop([n_col, N1_col, n_case_col, N2_col]) if bt else d.drop([n_col, N1_col, N2_col])
def estimate_poprare(z_df, n_col, N1_col, n_case_col, N2_col, eaf_col, bt, ovp, log, r12=None):
n, N1, n_case, N2, eaf = pl.col(n_col), pl.col(N1_col), pl.col(n_case_col) if bt else None, pl.col(N2_col), pl.col(eaf_col)
corr_matrix = z_df.select(["Z_y_lab", "Z_yhat_unlab", "Z_yhat_lab"]).corr()
if r12 is None:
r12 = float(corr_matrix[0, 2])
else:
r12 = r12
if ovp == False:
r13 = 0
r23 = 0
log.log(f"--- The imputation quality r = {r12:.3f}")
else:
r13 = float(corr_matrix[0, 1])
r23 = float(corr_matrix[1, 2])
log.log(f"\n--- The correlation between RVAS estimates on y and yhat in labeled data = {r12:.3f}")
log.log(f"--- The correlation between RVAS estimates on y in labeled data and yhat in unlabeled data = {r13:.3f}")
log.log(f"--- The correlation between RVAS estimates on yhat in labeled and unlabeled data = {r23:.3f}")
d = _cal_qt_wtd(z_df=z_df.lazy(), n=n, N1=N1, N2=N2, r12=r12, r13=r13, r23=r23, eaf=eaf)
return _qt_to_bt(d=d, n=n, n_case=n_case, eaf=eaf).drop([n_col, N1_col, n_case_col, N2_col]) if bt else d.drop([n_col, N1_col, N2_col])
def estimate_popburden(z_df, n_col, N1_col, N2_col, ovp, log, r12=None):
n, N1, N2 = pl.col(n_col), pl.col(N1_col), pl.col(N2_col)
corr_matrix = z_df.select(["Z_y_lab", "Z_yhat_unlab", "Z_yhat_lab"]).corr()
if r12 is None:
r12 = float(corr_matrix[0, 2])
else:
r12 = r12
if ovp == False:
r13 = 0
r23 = 0
log.log(f"--- The imputation quality r = {r12:.3f}")
else:
r13 = float(corr_matrix[0, 1])
r23 = float(corr_matrix[1, 2])
log.log(f"\n--- The correlation between RVAS estimates on y and yhat in labeled data = {r12:.3f}")
log.log(f"--- The correlation between RVAS estimates on y in labeled data and yhat in unlabeled data = {r13:.3f}")
log.log(f"--- The correlation between RVAS estimates on yhat in labeled and unlabeled data = {r23:.3f}")
d = _cal_qt_wtd(z_df=z_df.lazy(), n=n, N1=N1, N2=N2, r12=r12, r13=r13, r23=r23, eaf= None)
return d.drop([n_col, N1_col, N2_col])