forked from tqian/binary-outcome-mrt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdgm_binary_categorical_covariate.R
114 lines (82 loc) · 3.25 KB
/
dgm_binary_categorical_covariate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# for binary outcome
# Tianchen Qian, 2018.07.24
# In this code, I constructed examples where:
# p_t is constant (0.5)
# S_t (covariate) takes 3 values
# and the log linear GEE is biased for beta.
expit <- function(x){
return(exp(x)/(1+exp(x)))
}
dgm_binary_categorical_covariate <- function(sample_size, total_T) {
# same DGM as dgm_binary above, but faster
baseline_Y_S0 <- 0.2
baseline_Y_S1 <- 0.5
baseline_Y_S2 <- 0.4
beta_0 <- 0.1
beta_1 <- 0.3
prob_a <- 0.2
df_names <- c("userid", "day", "Y", "A", "S", "S2", "prob_Y", "prob_Y_A0", "prob_A")
dta <- data.frame(matrix(NA, nrow = sample_size * total_T, ncol = length(df_names)))
names(dta) <- df_names
dta$userid <- rep(1:sample_size, each = total_T)
dta$day <- rep(1:total_T, times = sample_size)
for (t in 1:total_T) {
# row index for the rows corresponding to day t for every subject
row_index <- seq(from = t, by = total_T, length = sample_size)
dta$S[row_index] <- sample(c(0,1,2), sample_size, replace = TRUE)
dta$S2[row_index] <- ifelse(dta$S[row_index] == 2, 1, 0)
dta$prob_A[row_index] <- rep(prob_a, sample_size)
dta$A[row_index] <- rbinom(sample_size, 1, dta$prob_A[row_index])
dta$prob_Y_A0[row_index] <- ifelse(dta$S[row_index] == 0, baseline_Y_S0,
ifelse(dta$S[row_index] == 1, baseline_Y_S1, baseline_Y_S2))
dta$prob_Y[row_index] <- dta$prob_Y_A0[row_index] * exp(dta$A[row_index] * (beta_0 + beta_1 * dta$S[row_index]))
dta$Y[row_index] <- rbinom(sample_size, 1, dta$prob_Y[row_index])
}
return(dta)
}
## true beta for (Intercept, S)
beta_true <- c(0.1, 0.3)
## true beta for Intercept
beta_true_marginal <- 0.4770512
## true alpha for (Intercept, S, S2)
alpha_true <- c(-1.6094379, 0.9162907, -1.1394343)
# try out the range of Y
if (0) {
set.seed(123)
dta <- dgm_binary_two_covariate(100, 30)
summary(dta$prob_Y)
summary(dta$prob_A)
}
# compute marginal beta_true
if (0) {
### numerically, we have ###
set.seed(123)
sample_size <- 500000
total_T <- 30
dta <- dgm_binary_categorical_covariate(sample_size, total_T)
tmp <- aggregate(Y ~ day + A, dta, mean)
beta_true_marginal <- log(sum(tmp$Y[(total_T+1):(2*total_T)]) / sum(tmp$Y[1:total_T]))
print(beta_true_marginal)
# p_a = 0.5, beta_true_marginal = 0.4777817
# p_a = 0.2, beta_true_marginal = 0.4780562
### analytically, we have ###
baseline_Y_S0 <- 0.2
baseline_Y_S1 <- 0.5
baseline_Y_S2 <- 0.4
beta_0 <- 0.1
beta_1 <- 0.3
prob_a <- 0.2
numerator <- baseline_Y_S0 * exp(beta_0) + baseline_Y_S1 * exp(beta_0 + beta_1) + baseline_Y_S2 * exp(beta_0 + 2 * beta_1)
denominator <- baseline_Y_S0 + baseline_Y_S1 + baseline_Y_S2
log(numerator / denominator)
}
# compute true alpha for (Intercept, S, S2)
if (0) {
baseline_Y_S0 <- 0.2
baseline_Y_S1 <- 0.5
baseline_Y_S2 <- 0.4
alpha_0 <- log(baseline_Y_S0)
alpha_1 <- log(baseline_Y_S1 / baseline_Y_S0)
alpha_2 <- log(baseline_Y_S2) - log(baseline_Y_S0) - 2 * log(baseline_Y_S1 / baseline_Y_S0)
print(c(alpha_0, alpha_1, alpha_2))
}