-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_dce_mp.py
150 lines (108 loc) · 5.14 KB
/
get_dce_mp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import time
from IPython.display import clear_output
# some prob need fitting now...
from utils_dce import get_views_coord
from utils_dce import test_val_train
from utils_dce import sample_conflict_timeline
from utils_dce import get_hyper_priors
from utils_dce import predict
from utils_dce import get_mse
from utils_dce import get_metrics
import pymc3 as pm
import theano
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn import metrics
import warnings
warnings.simplefilter("ignore", UserWarning)
# get df:
# Load the scenario-prediction frame (mu/var per cell-month) produced upstream
# by get_sce_pred_df. Local (laptop) path kept commented for off-cluster runs:
#pkl_file = '/home/simon/Documents/Articles/conflict_prediction/data/computerome/currents/sce_pred_df.pkl'
# Context manager guarantees the handle is closed even if unpickling raises.
with open('/home/projects/ku_00017/data/generated/currents/sce_pred_df.pkl', 'rb') as pkl_file:
    sce_pred_df = pickle.load(pkl_file)

# get df:
# Load the ViEWS coordinate frame via the project helper.
#path = '/home/simon/Documents/Articles/conflict_prediction/data/ViEWS'
path = '/home/projects/ku_00017/data/generated/currents'
file_name = 'ViEWS_coord.pkl'
df_views_coord = get_views_coord(path = path, file_name = file_name)

# Notice: so you get all obs now but naturally mu etc. is only there for the
# set used in get_sce_pred_df. The outer merge keeps every observation from
# both frames; rows without a scenario prediction get NaN in mu/var.
df = pd.merge(
    sce_pred_df,
    df_views_coord[['id', 'pg_id', 'month_id', 'month', 'year', 'gwcode',
                    'xcoord', 'ycoord', 'ged_best_sb', 'ged_dummy_sb']],
    how = 'outer',
    on = ['month_id', 'xcoord', 'ycoord'])
# Tag the scenario estimates explicitly (sce_*) to avoid column-name clashes.
df.rename(columns = {'mu' : 'sce_mu', 'var': 'sce_var'}, inplace=True)
# ---------------------------------------------------------------------------------------
# Run configuration.
# As a start we still assume two trends (short- and long-term GP components).
# TODO: redo the full experiment; ultimately cm_pred should be imported and
# used in the predictions to get anything accurate — we want complementary signals.
# ---------------------------------------------------------------------------------------
# Dict for the dfs/dicts holding the results.
out_dict = {}
# Minimum number of conflicts in the timelines that get predicted. Use C = 0 for a full run.
C_pred = 1 # 0 for a full run
# Minimum number of conflicts in one year in the timelines used to estimate hyper parameters.
C_est = 8 # 8
# Conflict type; some types might need a lower C_est than 100 to work.
# This is now sce_mu (already on model scale), so no log transform is needed here.
conf_type = 'sce_mu'
# Kernel for the short-term trend component.
s_kernel = 'Matern32'
# Start the wall-clock timer for the whole estimation run.
start_time = time.time()
# NOTE(review): df_merge apparently only holds the training set right now — confirm.
# Get train and validation ids (test_time=False: validation split, not test split).
train_id, val_id = test_val_train(df, test_time= False)
print(f"{C_est}_{C_pred}_{conf_type}_{s_kernel}\n")
# Construct the GPs and get the MAP estimate of the shared hyper parameters.
hps = get_hyper_priors()  # hyper-prior settings; these might need some changing...

with pm.Model() as model:

    # Short term trend/irregularities ---------------------------------
    ℓ_s = pm.Gamma("ℓ_s", alpha=hps['ℓ_alpha_s'], beta=hps['ℓ_beta_s'])
    η_s = pm.HalfCauchy("η_s", beta=hps['η_beta_s'])
    # Zero mean function for the short term trend.
    mean_s = pm.gp.mean.Zero()
    # Matern32 covariance over the 1-d month index (matches s_kernel above).
    cov_s = η_s ** 2 * pm.gp.cov.Matern32(1, ℓ_s)
    gp_s = pm.gp.Marginal(mean_func=mean_s, cov_func=cov_s)

    # Long term trend -------------------------------------------------
    ℓ_l = pm.Gamma("ℓ_l", alpha=hps['ℓ_alpha_l'], beta=hps['ℓ_beta_l'])
    η_l = pm.HalfCauchy("η_l", beta=hps['η_beta_l'])
    mean_l = pm.gp.mean.Zero()
    cov_l = η_l ** 2 * pm.gp.cov.ExpQuad(1, ℓ_l)  # smooth long-term cov func.
    gp_l = pm.gp.Marginal(mean_func=mean_l, cov_func=cov_l)

    # Noise (constant "white noise") -----------------------------------
    σ = pm.HalfCauchy("σ", beta=hps['σ_beta'])

    # Sample the training timelines used to estimate the hyper parameters.
    sample_pr_id = sample_conflict_timeline(conf_type=conf_type, df=df,
                                            train_id=train_id, test_id=val_id,
                                            C=C_est)

    # Full (additive) GP: short + long term trend ----------------------
    gp = gp_s + gp_l

    df_sorted = df.sort_values(['pg_id', 'month_id'])
    # Loop-invariant training mask, hoisted out of the loop.
    in_train = df_sorted['id'].isin(train_id)

    # One marginal likelihood per sampled timeline; all terms share the
    # hyper parameters defined above.
    # BUG FIX: the original fed every timeline through a single pair of
    # theano.shared containers (X, y), calling set_value inside the loop.
    # Each marginal_likelihood keeps a symbolic reference to those shared
    # containers, so at find_MAP time every likelihood term evaluated
    # against the LAST timeline's data. Passing concrete arrays per
    # iteration gives each likelihood its own data.
    for i, pg in enumerate(sample_pr_id):
        print(f'Time-line {i+1}/{sample_pr_id.shape[0]} in the works (estimation)...', end= '\r')
        mask = in_train & (df_sorted['pg_id'] == pg)
        X_i = df_sorted.loc[mask, 'month_id'].values[:, None]  # (n, 1) GP inputs
        y_i = df_sorted.loc[mask, conf_type].values            # (n,) targets
        gp.marginal_likelihood(f'y_{i}', X=X_i, y=y_i, noise=σ)

    # MAP estimate of all hyper parameters given the joint likelihood.
    mp = pm.find_MAP()

print('Got mp')

# Persist the MAP estimate for the downstream prediction scripts.
print('Pickling..')
new_file_name = '/home/projects/ku_00017/data/generated/currents/dce_mp.pkl'
with open(new_file_name, 'wb') as output:
    pickle.dump(mp, output)
# Stop the timer and report total wall-clock run time in minutes.
minutes_elapsed = (time.time() - start_time) / 60
print(f'Run for {minutes_elapsed:.3} minutes')