-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathsnpMAP.py
More file actions
95 lines (64 loc) · 2.62 KB
/
snpMAP.py
File metadata and controls
95 lines (64 loc) · 2.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# This will be the startup script
import os
import numpy as np
from utils import parser
from utils.dir_utils import prepare_directiories, clear_cache
from features.feature_extactor import FeatureExtractor
from preprocessing import preprocess
from classification.model import SnpMAPModel, model_trained
from io_utils.writing import ResultsWriter
from constants import default_mv_fields
# Module-level configuration, read once at import time and shared by run()/main().
config = parser.load_config()

# Output locations: scratch/cache directory and the final results directory.
cache_dir, results_dir = (
    config['output']['cache_dir'],
    config['output']['results_dir'],
)

# Input VCF path for each pipeline mode; run() looks the path up by its `mode`.
fpaths = dict(
    # ClinVar training file (original author's note: does not need to be
    # shrunk or annotated).
    train=config['input']['clinvar_path'],
    # Case file whose variants are classified in 'work' mode.
    work=config['input']['case_path'],
)
def run(mode, SNP_type, classification):
    """Run the preprocessing -> features -> model pipeline for one SNP type.

    Args:
        mode: Key into the module-level ``fpaths`` dict; the code branches on
            the values ``'train'`` and ``'work'``.
        SNP_type: SNP category, forwarded to the preprocessing, feature
            extraction and model steps; also indexes ``default_mv_fields``.
        classification: Classification setting forwarded to label extraction
            and dataset construction.

    Returns:
        In ``'work'`` mode, the ``'results'`` entry of ``model.predict()``.
        In ``'train'`` mode (or any other mode that reaches the end),
        ``None``.
    """
    # Preprocessing chain: every step consumes the path produced by the
    # previous one.
    small_vcf = preprocess.shrink_vcf(fpaths[mode], mode)
    annotated_vcf = preprocess.annotate_vcf(small_vcf, mode)
    snps_fpath = preprocess.extract_SNPs(
        annotated_vcf, SNP_type=SNP_type, mode=mode)
    labels = preprocess.extract_labels(snps_fpath, mode, classification)

    # Build the feature dataset from the extracted SNPs.
    extractor = FeatureExtractor(
        snps_fpath,
        SNP_type=SNP_type,
        mode=mode,
        mv_fields=default_mv_fields[SNP_type],
    )
    extractor.construct_features()
    features_fpath, features_labels_fpath = extractor.make_dataset(
        labels, classification)

    model = SnpMAPModel(
        features_fpath, features_labels_fpath, mode=mode, SNP_type=SNP_type)

    if mode == 'train':
        model.train()
        return None
    if mode == 'work':
        return model.predict()['results']
    return None
def _retrain_requested(flag):
    """Return True when the ``retrain`` config value asks for retraining.

    Accepts a real boolean ``True`` as well as the string ``'True'`` in any
    letter case (surrounding whitespace ignored). The previous check only
    matched the exact string ``'True'``, so a boolean coming from the config
    loader was silently treated as "do not retrain". Any other value means
    "no".
    """
    if isinstance(flag, str):
        return flag.strip().lower() == 'true'
    return flag is True


def main():
    """Train (when needed) and classify both SNP types, then write results.

    For each of ``'nsSNP'`` and ``'sSNP'``: the model is trained first when
    the config requests retraining or ``model_trained`` reports no trained
    model for that type; then the case file is classified via
    ``run('work', ...)``. The per-type predictions are summarised on stdout,
    merged into one mapping and handed to ``ResultsWriter``.
    """
    prepare_directiories(cache_dir, results_dir)

    # Neither value changes between iterations, so read them once.
    classification = config['classification']
    force_retrain = _retrain_requested(config['retrain'])

    results = []
    for SNP_type in ['nsSNP', 'sSNP']:
        # Short-circuit: model_trained() is only consulted when retraining
        # was not explicitly requested.
        if force_retrain or not model_trained(SNP_type):
            print('Preprating to train the model.')
            # clear_cache('train', cache_dir)
            run('train', SNP_type, classification)
            print('Model training DONE.')

        print(f'Preparing to classify the {SNP_type} variants the case file.')
        r = run('work', SNP_type, classification)
        print(f'Prediction for the type {SNP_type} finished.')
        results.append(r)

    # Per-type summary: distinct predicted values with their counts.
    for type_results in results:
        print(np.unique(list(type_results.values()), return_counts=True))

    # Merge into the sSNP mapping; update() lets nsSNP entries (results[0])
    # win when the same key appears in both.
    R = results[1]
    R.update(results[0])

    writer = ResultsWriter(fpaths['work'])
    writer.write_results(R)
    print('DONE')
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()