-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathposthoc_rand_index.py
100 lines (77 loc) · 3.82 KB
/
posthoc_rand_index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import numpy as np
import pandas as pd
import sys
import sklearn.metrics
import sklearn.cluster
import re
import cluster_metrics
def main():
    """Post-hoc evaluation of KC (knowledge component) recovery.

    Reads a results CSV, and for every (dataset, model) pair found in it,
    compares the model's predicted problem->KC assignment against the
    reference problem->skill assignment from the dataset file. Appends
    rand-index / recovery columns to the CSV and overwrites it in place.

    CLI arguments:
        sys.argv[1] -- directory holding raw model parameter files (.npz)
        sys.argv[2] -- results CSV path (read AND overwritten in place)
        sys.argv[3] -- number of KCs the models were fit with (int)
    """
    raw_results_dir = sys.argv[1]
    results_file = sys.argv[2]
    model_kcs = int(sys.argv[3])

    all_results_df = pd.read_csv(results_file)
    datasets = set(all_results_df['dataset'])
    models = set(all_results_df['model'])

    # One output value per results-CSV row, filled via boolean-mask
    # assignment below. fi_col is allocated but its use is commented out.
    ri_col = np.zeros(all_results_df.shape[0])
    fi_col = np.zeros_like(ri_col)
    vi_col = np.zeros_like(fi_col)
    rul_col = np.zeros_like(ri_col)

    for dataset_name in datasets:
        df = pd.read_csv("data/datasets/%s.csv" % dataset_name)
        # Ground-truth problem -> skill labels for this dataset.
        ref_assignment = get_problem_skill_assignment(df)
        # Best achievable "recovered" score given only model_kcs clusters.
        ul = cluster_metrics.recovered_upper_limit(ref_assignment, model_kcs)
        print(ul)
        # Blocked/interleaved variants share one embeddings file, hence the
        # suffix stripping before loading.
        problem_feature_mat = np.load("data/datasets/%s.embeddings.npy" % dataset_name.replace('_blocked', '').replace('_interleaved',''))
        for model in models:
            ri = []
            fi = []
            vi = []
            pred_assignments = []
            if model.startswith('sd'):
                # 'sd' models: read per-split KC assignment priors from the
                # saved parameter archive. 'realistic' datasets were fit
                # under a differently-named model (50 KCs variant).
                if 'realistic' in dataset_name:
                    params_path = "%s/%s_%s.params.npy.npz" % (raw_results_dir, re.sub(r'^sd', 'sd-50kcs', model), dataset_name)
                else:
                    params_path = "%s/%s_%s.params.npy.npz" % (raw_results_dir, model, dataset_name)
                params = np.load(params_path)
                Aprior = params['Aprior'] # Splits x Problems x KCs
                for i in range(Aprior.shape[0]):
                    Q = Aprior[i, :, :]
                    # Hard assignment: most probable KC per problem.
                    pred_assignment = np.argmax(Q, axis=1)
                    pred_assignments.append(pred_assignment)
            elif model.startswith('clustering') and not dataset_name.startswith('sd_1_'):
                # Baseline: k-means on problem embeddings, one model per split.
                splits = np.load("data/splits/%s.npy" % dataset_name)
                for s in range(splits.shape[0]):
                    split = splits[s, :]
                    train_ix = split == 2  # code 2 marks training rows
                    train_df = df[train_ix]
                    #
                    # build problem clustering model based on training problems only
                    #
                    train_problems = sorted(pd.unique(train_df['problem']))
                    train_problem_features = problem_feature_mat[train_problems, :]
                    kmeans_model = sklearn.cluster.KMeans(n_clusters=model_kcs, n_init='auto', random_state=0).fit(train_problem_features)
                    problem_labels = kmeans_model.predict(problem_feature_mat) # predict labels for all problems
                    pred_assignments.append(problem_labels)
            else:
                # Unrecognized model (or excluded dataset): leave its rows zero.
                continue
            for pred_assignment in pred_assignments:
                ri.append(sklearn.metrics.rand_score(ref_assignment, pred_assignment))
                #fi.append(cluster_metrics.fmeasure(ref_assignment, pred_assignment))
                vi.append(cluster_metrics.recovered(ref_assignment, pred_assignment, recall_thres=0.75, precision_thres=0.75))
            # NOTE(review): this mask-assignment assumes the CSV rows for
            # (model, dataset) appear in the same order as the splits were
            # appended above, and that their count matches len(ri) — verify
            # against how the results CSV is produced.
            ix = (all_results_df['model'] == model) & (all_results_df['dataset'] == dataset_name)
            ri_col[ix] = ri
            #fi_col[ix] = fi
            vi_col[ix] = vi
            rul_col[ix] = ul

    all_results_df['raw_rand_index'] = ri_col
    #all_results_df['fmeasure'] = fi_col
    all_results_df['recovered'] = vi_col
    all_results_df['recovered_ul'] = rul_col
    # Overwrites the input results file in place.
    all_results_df.to_csv(results_file, index=False)
    print(all_results_df)
def get_problem_skill_assignment(df):
    """Return the reference problem -> skill assignment as a dense array.

    Builds an array of length ``max(problem) + 1`` whose ``p``-th entry is
    the skill label recorded for problem ``p`` in ``df``. When a problem
    appears on several rows, the last row's skill wins. Raises ``KeyError``
    if any problem id in ``0..max`` never appears in ``df``.
    """
    skill_of = {prob: sk for prob, sk in zip(df['problem'], df['skill'])}
    n_problems = np.max(df['problem']) + 1
    assignment = [skill_of[prob] for prob in range(n_problems)]
    return np.array(assignment)
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()