-
Notifications
You must be signed in to change notification settings - Fork 10
Expand file tree
/
Copy pathab.py
More file actions
130 lines (120 loc) · 5.3 KB
/
ab.py
File metadata and controls
130 lines (120 loc) · 5.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from __future__ import annotations
from copy import deepcopy
from typing import Any
from ..comparators import TTest, UTest
from ..dataset import (Dataset, ExperimentData, StatisticRole, TargetRole,
TreatmentRole)
from ..extensions.statsmodels import MultiTest, MultitestQuantile
from ..utils import (ID_SPLIT_SYMBOL, NAME_BORDER_SYMBOL, ABNTestMethodsEnum,
BackendsEnum)
from .abstract import Analyzer
class ABAnalyzer(Analyzer):
    """Collect TTest/UTest results into one summary row and optionally run a
    multiple-testing correction when more than two groups are present.

    Args:
        multitest_method: Correction method to apply; ``None`` disables the
            multitest step. ``ABNTestMethodsEnum.quantile`` selects
            ``MultitestQuantile``; any other member is passed to ``MultiTest``.
        alpha: Forwarded to ``MultitestQuantile``.
        equal_variance: Forwarded to ``MultitestQuantile``.
        quantiles: Forwarded to ``MultitestQuantile.calc`` as ``quantiles``.
        iteration_size: Forwarded to ``MultitestQuantile``.
        random_state: Forwarded to ``MultitestQuantile``.
        key: Passed unchanged to the parent ``Analyzer`` constructor.
    """

    def __init__(
        self,
        multitest_method: ABNTestMethodsEnum | None = None,
        alpha: float = 0.05,
        equal_variance: bool = True,
        quantiles: float | list[float] | None = None,
        iteration_size: int = 20000,
        random_state: int | None = None,
        key: Any = "",
    ):
        self.multitest_method = multitest_method
        self.alpha = alpha
        self.equal_variance = equal_variance
        self.quantiles = quantiles
        self.iteration_size = iteration_size
        self.random_state = random_state
        super().__init__(key)

    def execute_multitest(self, data: ExperimentData, p_values: Dataset, **kwargs):
        """Run the configured multitest and store it under the ``"MultiTest"`` key.

        Nothing is computed (``data`` is returned as-is) when no multitest
        method is configured or the treatment column has two groups or fewer.

        Args:
            data: Experiment data whose dataset holds the treatment and
                target columns.
            p_values: Input handed to the multitest ``calc`` call.
            **kwargs: Extra arguments forwarded to ``MultiTest.calc``
                (not used by the quantile branch).

        Returns:
            ``data`` itself, or the result of ``self._set_value`` with the
            multitest result attached.
        """
        group_field = data.ds.search_columns(TreatmentRole())[0]
        target_fields = data.ds.search_columns(TargetRole(), search_types=[int, float])
        if self.multitest_method and len(data.groups[group_field]) > 2:
            if self.multitest_method != ABNTestMethodsEnum.quantile:
                multitest_result = MultiTest(self.multitest_method).calc(
                    p_values, **kwargs
                )
                # Build the group label column: every group except the first
                # one, each repeated once per target field.
                groups = []
                for i in list(data.groups[group_field].keys())[1:]:
                    groups += [i] * len(target_fields)
                # Tile the labels so the column length matches the result.
                multitest_result = multitest_result.add_column(
                    groups
                    * (
                        len(multitest_result)
                        // len(target_fields)
                        // (len(data.groups[group_field]) - 1)
                    ),
                    role={"group": StatisticRole()},
                )
            else:
                # Quantile method: one calc per target field, results stacked.
                multitest_result = Dataset.create_empty()
                for target_field in target_fields:
                    multitest_result = multitest_result.append(
                        MultitestQuantile(
                            self.alpha,
                            self.iteration_size,
                            self.equal_variance,
                            self.random_state,
                        ).calc(
                            p_values,
                            group_field=group_field,
                            target_field=target_field,
                            quantiles=self.quantiles,
                        )
                    )
            return self._set_value(data, multitest_result, key="MultiTest")
        return data

    def _add_pvalues(self, multitest_pvalues: Dataset, value: Any, field: str) -> Dataset:
        """Append ``value`` to the collected p-values when it is needed later.

        Only the ``"p-value"`` field is collected, and only when a
        non-quantile multitest method is configured.
        """
        # Compare against the enum member, as the rest of the class does,
        # instead of the bare string "quantile".
        if (
            self.multitest_method
            and field == "p-value"
            and self.multitest_method != ABNTestMethodsEnum.quantile
        ):
            multitest_pvalues = multitest_pvalues.append(value)
        return multitest_pvalues

    def execute(self, data: ExperimentData) -> ExperimentData:
        """Aggregate TTest/UTest analysis tables and run the optional multitest.

        For every executor returned by ``data.get_ids([TTest, UTest])`` the
        related analysis tables are stacked, split into one chunk per test
        group, and the mean of ``"p-value"`` and ``"pass"`` of each chunk is
        stored under ``"<executor> <field> <group>"``.

        Returns:
            The result of ``self._set_value`` with the one-row summary dataset
            (and the multitest result, when one was computed).
        """
        executor_ids = data.get_ids([TTest, UTest])
        group_field = data.ds.search_columns(TreatmentRole())[0]
        # Number of test groups: every group except the first one.
        num_groups = len(data.groups[group_field]) - 1
        groups = list(data.groups[group_field].items())
        multitest_pvalues = Dataset.create_empty()
        analysis_data = {}
        for c, spaces in executor_ids.items():
            analysis_ids = spaces.get("analysis_tables", [])
            if len(analysis_ids) == 0:
                continue
            t_data = deepcopy(data.analysis_tables[analysis_ids[0]])
            for aid in analysis_ids[1:]:
                t_data = t_data.append(data.analysis_tables[aid])
            if len(analysis_ids) < len(t_data):
                # Repeat the ids so there is one per stacked row. A new list
                # is built so the list held inside ``executor_ids`` is not
                # extended in place.
                analysis_ids = analysis_ids * num_groups
            t_data.data.index = analysis_ids
            # NOTE(review): the slicing below presumably relies on the stacked
            # rows being ordered group by group - confirm against the
            # TTest/UTest output layout.
            step = len(analysis_ids) // num_groups
            for f in ["p-value", "pass"]:
                for i in range(0, len(analysis_ids), step):
                    value = t_data.iloc[i : i + step][f]
                    multitest_pvalues = self._add_pvalues(multitest_pvalues, value, f)
                    # ``i // step`` is the chunk number; +1 skips the first
                    # group. Dividing by ``num_groups`` instead made chunks
                    # share a label (or index past the end) whenever
                    # ``step != num_groups``.
                    analysis_data[f"{c} {f} {groups[i // step + 1][0]}"] = (
                        value.mean()
                    )
            if c not in ["UTest", "TTest"]:
                indexes = t_data.index
                values = t_data.data.values.tolist()
                for idx, value in zip(indexes, values):
                    # The label is the text between the first and the last
                    # NAME_BORDER_SYMBOL of the final id segment.
                    name = idx.split(ID_SPLIT_SYMBOL)[-1]
                    analysis_data[
                        f"{c} {name[name.find(NAME_BORDER_SYMBOL) + 1 : name.rfind(NAME_BORDER_SYMBOL)]}"
                    ] = value[0]
        analysis_dataset = Dataset.from_dict(
            [analysis_data],
            {f: StatisticRole() for f in analysis_data},
            BackendsEnum.pandas,
        )
        # The quantile method, and the case with no collected p-values,
        # receive the raw dataset instead of the p-values.
        data = self.execute_multitest(
            data,
            (
                multitest_pvalues
                if not multitest_pvalues.is_empty()
                and self.multitest_method != ABNTestMethodsEnum.quantile
                else data.ds
            ),
        )
        return self._set_value(data, analysis_dataset)