1
+ """
2
+ Functions that generate data sets used in examples
3
+ """
1
4
import numpy as np
2
5
import pandas as pd
3
6
from scipy .stats import dirichlet , gamma , norm , uniform
11
14
def _smoothed_gaussian_random_walk (
12
15
gaussian_random_walk_mu , gaussian_random_walk_sigma , N , lowess_kwargs
13
16
):
17
+ """
18
+ Generates Gaussian random walk data and applies LOWESS
19
+
20
+ :param gaussian_random_walk_mu:
21
+ Mean of the random walk
22
+ :param gaussian_random_walk_sigma:
23
+ Standard deviation of the random walk
24
+ :param N:
25
+ Length of the random walk
26
+ :param lowess_kwargs:
27
+ Keyword argument dictionary passed to statsmodels lowess
28
+ """
14
29
x = np .arange (N )
15
30
y = norm (gaussian_random_walk_mu , gaussian_random_walk_sigma ).rvs (N ).cumsum ()
16
31
filtered = lowess (y , x , ** lowess_kwargs )
@@ -26,12 +41,25 @@ def generate_synthetic_control_data(
26
41
lowess_kwargs = default_lowess_kwargs ,
27
42
):
28
43
"""
29
- Example:
30
- >> import pathlib
31
- >> df, weightings_true = generate_synthetic_control_data(
32
- treatment_time=treatment_time
33
- )
34
- >> df.to_csv(pathlib.Path.cwd() / 'synthetic_control.csv', index=False)
44
+ Generates data for synthetic control example.
45
+
46
+ :param N:
47
+ Number of data points
48
+ :param treatment_time:
49
+ Index where treatment begins in the generated dataframe
50
+ :param grw_mu:
51
+ Mean of Gaussian Random Walk
52
+ :param grw_sigma:
53
+ Standard deviation of Gaussian Random Walk
54
+ :param lowess_kwargs:
55
+ Keyword argument dictionary passed to statsmodels lowess
56
+
57
+ Example
58
+ --------
59
+ >>> from causalpy.data.simulate_data import generate_synthetic_control_data
60
+ >>> df, weightings_true = generate_synthetic_control_data(
61
+ ... treatment_time=70
62
+ ... )
35
63
"""
36
64
37
65
# 1. Generate non-treated variables
@@ -70,6 +98,21 @@ def generate_synthetic_control_data(
70
98
def generate_time_series_data (
71
99
N = 100 , treatment_time = 70 , beta_temp = - 1 , beta_linear = 0.5 , beta_intercept = 3
72
100
):
101
+ """
102
+ Generates interrupted time series example data
103
+
104
+ :param N:
105
+ Length of the time series
106
+ :param treatment_time:
107
+ Index of when treatment begins
108
+ :param beta_temp:
109
+ The temperature coefficient
110
+ :param beta_linear:
111
+ The linear coefficient
112
+ :param beta_intercept:
113
+ The intercept
114
+
115
+ """
73
116
x = np .arange (0 , 100 , 1 )
74
117
df = pd .DataFrame (
75
118
{
@@ -99,6 +142,9 @@ def generate_time_series_data(
99
142
100
143
101
144
def generate_time_series_data_seasonal (treatment_time ):
145
+ """
146
+ Generates 10 years of monthly data with seasonality
147
+ """
102
148
dates = pd .date_range (
103
149
start = pd .to_datetime ("2010-01-01" ), end = pd .to_datetime ("2020-01-01" ), freq = "M"
104
150
)
@@ -146,6 +192,14 @@ def generate_time_series_data_simple(treatment_time, slope=0.0):
146
192
147
193
148
194
def generate_did ():
195
+ """
196
+ Generate Difference in Differences data
197
+
198
+ Example
199
+ --------
200
+ >>> from causalpy.data.simulate_data import generate_did
201
+ >>> df = generate_did()
202
+ """
149
203
# true parameters
150
204
control_intercept = 1
151
205
treat_intercept_delta = 0.25
@@ -157,6 +211,7 @@ def generate_did():
157
211
def outcome (
158
212
t , control_intercept , treat_intercept_delta , trend , Δ , group , post_treatment
159
213
):
214
+ """Compute the outcome of each unit"""
160
215
return (
161
216
control_intercept
162
217
+ (treat_intercept_delta * group )
@@ -191,16 +246,23 @@ def generate_regression_discontinuity_data(
191
246
N = 100 , true_causal_impact = 0.5 , true_treatment_threshold = 0.0
192
247
):
193
248
"""
194
- Example use:
195
- >> import pathlib
196
- >> df = generate_regression_discontinuity_data(true_treatment_threshold=0.5)
197
- >> df.to_csv(pathlib.Path.cwd() / 'regression_discontinuity.csv', index=False)
249
+ Generate regression discontinuity example data
250
+
251
+ Example
252
+ --------
253
+ >>> import pathlib
254
+ >>> from causalpy.data.simulate_data import generate_regression_discontinuity_data
255
+ >>> df = generate_regression_discontinuity_data(true_treatment_threshold=0.5)
256
+ >>> df.to_csv(pathlib.Path.cwd() / 'regression_discontinuity.csv',
257
+ ... index=False) # doctest: +SKIP
198
258
"""
199
259
200
260
def is_treated(x):
    """
    Flag which entries of x are at or above the treatment threshold.

    :param x:
        Values to compare against ``true_treatment_threshold``
    :returns:
        Result of the element-wise ``x >= true_treatment_threshold`` test
    """
    # true_treatment_threshold is not a local; it is resolved from the
    # surrounding scope.
    treated_mask = np.greater_equal(x, true_treatment_threshold)
    return treated_mask
202
263
203
264
def impact(x):
    """
    Assign true_causal_impact to all treated entries.

    :param x:
        Values whose treatment status is decided by ``is_treated``
    :returns:
        Array of length ``len(x)`` that is zero everywhere except at the
        treated positions, which hold ``true_causal_impact``
    """
    # Start from "no effect" for every entry, then overwrite treated ones.
    effect = np.zeros(len(x))
    treated_mask = is_treated(x)
    effect[treated_mask] = true_causal_impact
    return effect
@@ -214,6 +276,22 @@ def impact(x):
214
276
def generate_ancova_data (
215
277
N = 200 , pre_treatment_means = np .array ([10 , 12 ]), treatment_effect = 2 , sigma = 1
216
278
):
279
+ """
280
+ Generate ANCOVA example data
281
+
282
+ Example
283
+ --------
284
+ >>> import pathlib
285
+ >>> from causalpy.data.simulate_data import generate_ancova_data
286
+ >>> df = generate_ancova_data(
287
+ ... N=200,
288
+ ... pre_treatment_means=np.array([10, 12]),
289
+ ... treatment_effect=2,
290
+ ... sigma=1
291
+ ... )
292
+ >>> df.to_csv(pathlib.Path.cwd() / 'ancova_data.csv',
293
+ ... index=False) # doctest: +SKIP
294
+ """
217
295
group = np .random .choice (2 , size = N )
218
296
pre = np .random .normal (loc = pre_treatment_means [group ])
219
297
post = pre + treatment_effect * group + np .random .normal (size = N ) * sigma
@@ -233,6 +311,10 @@ def generate_geolift_data():
233
311
causal_impact = 0.2
234
312
235
313
def create_series(n=52, amplitude=1, length_scale=2):
    """
    Returns numpy tile with generated seasonality data repeated over
    multiple years

    :param n:
        Passed to ``generate_seasonality`` as ``n``
    :param amplitude:
        Passed to ``generate_seasonality`` as ``amplitude``
    :param length_scale:
        Passed to ``generate_seasonality`` as ``length_scale``
    :returns:
        The generated seasonality pattern shifted up by 3 and tiled
        ``n_years`` times
    """
    # Forward the caller's length_scale. It was previously hard-coded to 2,
    # so the argument was silently ignored. The default is still 2, which
    # keeps calls that rely on the default unchanged.
    pattern = generate_seasonality(
        n=n, amplitude=amplitude, length_scale=length_scale
    )
    # n_years is not a local; it is resolved from the surrounding scope.
    return np.tile(pattern + 3, n_years)
0 commit comments