-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdataplumbing.py
77 lines (67 loc) · 2.74 KB
/
dataplumbing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#########################################################################################
# Author: Jared L. Ostmeyer
# Date Started: 2016-07-26
# Environment: Python3
# License: See LICENSE
# Purpose: Load dataset and create interfaces for piping the data to the model.
##########################################################################################
import csv
import numpy as np
import lib_paths
import atchley_factors as vector_representation
def load_repertoires(data_dir):
    """Load every sample listed in the answer key into a nested dictionary.

    Reads data_dir + '/data/answers.csv', which must provide the columns
    'Sample' and 'Diagnosis'. For each row, the matching file
    data_dir + '/data/sample_<Sample>.csv' is read; it must provide the
    columns 'Sequence' and 'Count'. Counts of rows that share the same
    sequence within one sample file are added together.

    Args:
        data_dir: Path of the directory that contains the 'data' folder.

    Returns:
        A dict keyed by sample ID. Each value is a dict with two entries:
        'Diagnosis', the raw string taken from the answer key, and
        'Sequences', a dict mapping each sequence string to its summed
        count as a float. If a sample ID is listed more than once in the
        answer key, the last row replaces the earlier ones.

    Raises:
        OSError: If the answer key or a sample file cannot be opened.
        KeyError: If one of the required columns is missing.
        ValueError: If a 'Count' value cannot be parsed as a float.
    """
    repertoires = dict()
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted
    # fields are altered before the reader sees them.
    with open(data_dir+'/data/answers.csv', 'r', newline='') as keyfile_stream:
        for keyfile_row in csv.DictReader(keyfile_stream, delimiter=','):
            sample_id = keyfile_row['Sample']
            diagnosis_id = keyfile_row['Diagnosis']
            sequences = dict()
            path = data_dir+'/data/sample_'+sample_id+'.csv'
            with open(path, 'r', newline='') as sample_stream:
                for sample_row in csv.DictReader(sample_stream, delimiter=','):
                    sequence = sample_row['Sequence']
                    count = float(sample_row['Count'])
                    # The same sequence may be listed on several rows;
                    # accumulate rather than overwrite.
                    if sequence in sequences:
                        sequences[sequence] += count
                    else:
                        sequences[sequence] = count
            repertoires[sample_id] = {
                'Diagnosis': diagnosis_id,
                'Sequences': sequences
            }
    return repertoires
def _count_snippets(sequences, snip_size):
    """Total the counts of every length-`snip_size` window over all sequences.

    Each occurrence of a window adds the count of the sequence it came from
    to that window's total. Sequences shorter than `snip_size` yield no
    windows and therefore contribute nothing.
    """
    snips = {}
    for sequence, count in sequences.items():
        for start in range(len(sequence)-snip_size+1):
            snip = sequence[start:start+snip_size]
            if snip in snips:
                snips[snip] += count
            else:
                snips[snip] = count
    return snips


def process_repertoires(repertoires, snip_size=6):
    """Convert repertoires into padded NumPy arrays of snippet features.

    Every sequence of every sample is cut into all of its overlapping
    snippets of `snip_size` characters, and the counts of identical
    snippets within a sample are summed. Samples are then written to the
    output arrays in ascending order of their sample ID.

    Args:
        repertoires: Dict keyed by sample ID whose values are dicts with the
            entries 'Diagnosis' (convertible with float()) and 'Sequences'
            (a dict mapping sequence strings to counts).
        snip_size: Number of characters per snippet.

    Returns:
        A tuple (xs, cs, ys) of float32 arrays:
        xs has shape (num_samples, max_snips, num_features) and holds
        vector_representation.features(snip) for each snippet;
        cs has shape (num_samples, max_snips) and holds each snippet's
        summed count;
        ys has shape (num_samples,) and holds each sample's diagnosis.
        max_snips is the largest number of distinct snippets in any sample;
        samples with fewer snippets keep zeros in the unused slots. An
        empty `repertoires` yields arrays with zero samples and zero
        snippets instead of raising.
    """
    repertoires_snip = {
        sample: {
            'Diagnosis': repertoire['Diagnosis'],
            'Snips': _count_snippets(repertoire['Sequences'], snip_size)
        }
        for sample, repertoire in repertoires.items()
    }

    num_samples = len(repertoires_snip)
    # NOTE(review): assumes vector_representation.features(snip) returns
    # snip_size*vector_representation.length values - confirm against
    # atchley_factors.
    num_features = snip_size*vector_representation.length
    # default=0 keeps the array width non-negative when there are no samples
    # (a -1 sentinel would make np.zeros raise on an empty input).
    max_snips = max(
        (len(entry['Snips']) for entry in repertoires_snip.values()),
        default=0
    )

    xs = np.zeros((num_samples, max_snips, num_features), dtype=np.float32)  # Features
    cs = np.zeros((num_samples, max_snips), dtype=np.float32)  # Snippet count
    ys = np.zeros((num_samples), dtype=np.float32)  # Labels

    # Sort by sample ID so the row order does not depend on insertion order.
    ordered = sorted(repertoires_snip.items(), key=lambda item: item[0])
    for i, (_, repertoire) in enumerate(ordered):
        for j, (snip, count) in enumerate(repertoire['Snips'].items()):
            xs[i, j, :] = vector_representation.features(snip)
            cs[i, j] = float(count)
        ys[i] = float(repertoire['Diagnosis'])
    return xs, cs, ys