-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfeature.py
134 lines (116 loc) · 4.78 KB
/
feature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python
import cPickle as pickle
import numpy as np
from collections import defaultdict
from collections import OrderedDict
from config import *
def read_pkl(name):
    """Load and return the object pickled in the file at path *name*.

    Fix: the file is now opened in binary mode ('rb').  Pickle data is
    binary; text mode silently corrupts protocol>=1 pickles on Windows
    under Python 2 and fails outright under Python 3.
    """
    with open(name, 'rb') as fin:
        return pickle.load(fin)
class feature(object):
    """Holds pickled train/valid/test protein datasets and serves features.

    Each dataset is a list of per-protein info dicts (keys used below:
    'sequence', 'ACC', 'SS3', 'PSSM', 'ccmpredZ', 'psicovZ', 'OtherPairs',
    'contactMatrix').  Also provides shuffled mini-batching over the
    training set via next_batch().
    """
    def __init__(self, train, valid, test):
        # train/valid/test are paths to pickle files, each a list of
        # per-protein info dicts.
        self.train_info = read_pkl(train)
        self.valid_info = read_pkl(valid)
        self.test_info = read_pkl(test)
        # Cursor/epoch bookkeeping used by next_batch().
        self._index_in_epoch = 0
        self._epochs_completed = 0
        self._num_examples = len(self.train_info)
    def extract_single(self, info):
        """Build (sequence_profile, pairwise_profile, true_contact) for one protein.

        sequence_profile: per-residue features -- PSSM, SS3 and ACC
            concatenated along axis 1 (the original comment says shape
            (L, 26); the widths of the parts are not visible here --
            TODO confirm).
        pairwise_profile: per-pair features -- ccmpredZ and psicovZ
            depth-stacked, then OtherPairs appended -> shape (L, L, 5).
        true_contact: contact map with negative entries zeroed and the
            band |i - j| < 6 around the diagonal removed.

        NOTE(review): true_contact is modified in place, so the stored
        info['contactMatrix'] is mutated as a side effect of this call.
        """
        seq = info['sequence']
        seqLen = len(seq)  # NOTE(review): computed but never used below
        acc = info['ACC']
        ss3 = info['SS3']
        pssm = info['PSSM']
        sequence_profile = np.concatenate((pssm, ss3, acc), axis = 1)
        #shape = (L, 26)
        ccmpred = info['ccmpredZ']
        psicov = info['psicovZ']
        other = info['OtherPairs']
        pairwise_profile = np.dstack((ccmpred, psicov))
        pairwise_profile = np.concatenate((pairwise_profile, other), axis = 2) #shape = (L, L, 5)
        true_contact = info['contactMatrix']
        true_contact[true_contact < 0] = 0 # transfer -1 to 0, shape = (L, L)
        true_contact = np.tril(true_contact, k=-6) + np.triu(true_contact, k=6) # remove the diagonal contacts (|i-j| < 6)
        return sequence_profile, pairwise_profile, true_contact
    def process_feature(self, infos):
        """Run extract_single over *infos*; return [f1, f2, labels] arrays.

        NOTE(review): proteins generally differ in length L, so the
        np.array(...) calls below presumably produce dtype=object arrays
        of per-protein arrays rather than stacked tensors -- confirm
        against the consumer.
        """
        f1 = []
        f2 = []
        fl = []
        for info in infos:
            x1, x2, y = self.extract_single(info)
            f1.append(x1)
            f2.append(x2)
            fl.append(y)
            #data['features'].append([x1, x2])
            #data['labels'].append(y)
        #return [np.concatenate(f1, axis=0), np.concatenate(f2, axis=0), np.concatenate(fl, axis=0)]
        return [np.array(f1), np.array(f2), np.array(fl)]
    def get_feature(self):
        """Extract features for all three splits; cache them on self.

        Must be called before next_batch(), which reads self.train_data.
        Returns (train_data, valid_data, test_data).
        """
        # training data
        self.train_data = self.process_feature(self.train_info)
        # validation data
        self.valid_data = self.process_feature(self.valid_info)
        # testing data
        self.test_data = self.process_feature(self.test_info)
        return self.train_data, self.valid_data, self.test_data
    def get_one(self):
        """Return the first raw info dict of the validation set."""
        return self.valid_info[0]
    def next_batch(self, batch_size, shuffle = True):
        """Return the next training batch as (f1, f2, labels).

        Shuffles on the very first call and again at every epoch
        boundary; a batch that crosses the boundary is stitched from the
        tail of the old permutation and the head of the new one.

        NOTE(review): requires get_feature() to have been called first
        (self.train_data must exist).  The `shuffle` parameter is
        accepted but never consulted -- shuffling always happens.
        """
        start = self._index_in_epoch
        if start == 0 and self._epochs_completed == 0:
            # First call ever: build the initial shuffled views.
            idx = np.arange(0, self._num_examples) # get all possible indexes
            np.random.shuffle(idx) # shuffle indexes
            self._f1 = self.train_data[0][idx]
            self._f2 = self.train_data[1][idx]
            self._fl = self.train_data[2][idx]
        # go to the next batch
        if start + batch_size > self._num_examples:
            # Epoch boundary: take the remaining tail, reshuffle, then
            # top up the batch from the head of the new permutation.
            self._epochs_completed += 1
            rest_num_examples = self._num_examples - start
            data_rest_part1 = self._f1[start:self._num_examples]
            data_rest_part2 = self._f2[start:self._num_examples]
            data_rest_part3 = self._fl[start:self._num_examples]
            idx0 = np.arange(0, self._num_examples) # get all possible indexes
            np.random.shuffle(idx0) # shuffle indexes
            self._f1 = self.train_data[0][idx0]
            self._f2 = self.train_data[1][idx0]
            self._fl = self.train_data[2][idx0]
            start = 0
            self._index_in_epoch = batch_size - rest_num_examples #avoid the case where the #sample != integer times of batch_size
            end = self._index_in_epoch
            data_new_part1 = self._f1[start:end]
            data_new_part2 = self._f2[start:end]
            data_new_part3 = self._fl[start:end]
            return np.concatenate((data_rest_part1, data_new_part1), axis=0), np.concatenate((data_rest_part2, data_new_part2), axis=0), np.concatenate((data_rest_part3, data_new_part3), axis=0)
        else:
            self._index_in_epoch += batch_size
            end = self._index_in_epoch
            return self._f1[start:end], self._f2[start:end], self._fl[start:end]
def test():
    """Smoke test: load the datasets named in config, print the name and
    a 0/1 rendering of the contact matrix of the first validation protein.

    Fix: Python-2-only `print` statements replaced with the parenthesized
    form, which is valid (and behaves identically for these single-value
    calls) under both Python 2 and Python 3.
    """
    F = feature(train_file, valid_file, test_file)
    one_data = F.get_one()
    print(one_data['name'])
    m = one_data['contactMatrix']
    for x in m:
        # Render each row as a string of 1s (contact) and 0s (no contact).
        print(''.join([str(1 if i > 0 else 0) for i in x]))
# NOTE(review): dead scratch code kept alive as a module-level string
# literal -- it has no runtime effect and is a candidate for deletion.
# (It contains Python-2-only constructs: print statements, has_key, xrange.)
'''
def lensBin(self):
    dic = {}
    count = 0
    for info in self.train_info:
        count += 1
        l = len(info['sequence'])
        if dic.has_key(l):
            dic[l] += 1
        else:
            dic[l] = 1
    order_dict = OrderedDict(dic)
    for k, v in order_dict.items():
        print "%d %d" %(k, v)
    print 'count = %d' %count
F = feature(train_file, valid_file, test_file)
F.lensBin()
train_data, valid_data, test_data = F.get_feature()
for i in xrange(1):
    batch = F.next_batch(1)
'''