1
1
#!/usr/bin/env python3
2
2
3
3
import pandas as pd
4
+ import numpy as np
4
5
from sklearn .model_selection import train_test_split
5
6
from sklearn .linear_model import LogisticRegression
6
7
from sklearn import metrics
@@ -113,6 +114,10 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
113
114
indiciesToKeep = model_headers [1 :]
114
115
115
116
referenceSeq = findReferenceSeq ()
117
+ # possible nucleotide symbols
118
+ categories = ['-' ,'A' , 'C' , 'G' , 'T' ]
119
+ columns = [f"{ i } _{ c } " for i in indiciesToKeep for c in categories ]
120
+ refRow = [r == c for r in encodeSeq (referenceSeq , indiciesToKeep ) for c in categories ]
116
121
117
122
print ("loading model " + datetime .now ().strftime ("%m/%d/%Y, %H:%M:%S" ))
118
123
loaded_model = joblib .load (args .model_file )
@@ -125,30 +130,16 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
125
130
len (seqList ), datetime .now ().strftime ("%m/%d/%Y, %H:%M:%S" )
126
131
))
127
132
133
+ rows = [[r == c for r in row for c in categories ] for row in seqList ]
128
134
# the reference seq must be added to every block to make sure that the
129
135
# spots in the reference that have Ns are in the dataframe to guarantee that
130
136
# the correct number of columns is created when get_dummies is called
137
+ rows .append (refRow )
131
138
idList .append (referenceId )
132
- seqList .append (encodeSeq (referenceSeq , indiciesToKeep ))
133
139
134
140
# create a data frame from the seqList
135
- df = pd .DataFrame (seqList , columns = indiciesToKeep )
136
-
137
- # possible nucleotide symbols
138
- categories = ['A' , 'C' , 'G' , 'T' , '-' ]
139
-
140
- # add extra rows to ensure all of the categories are represented, as otherwise
141
- # not enough columns will be created when we call get_dummies
142
- extra_rows = [[i ] * len (indiciesToKeep ) for i in categories ]
143
- df = pd .concat ([df , pd .DataFrame (extra_rows , columns = indiciesToKeep )], ignore_index = True )
144
-
145
- # get one-hot encoding
146
- df = pd .get_dummies (df , columns = indiciesToKeep )
147
-
148
- headers = list (df )
149
-
150
- # get rid of the fake data we just added
151
- df .drop (df .tail (len (categories )).index ,inplace = True )
141
+ d = np .array (rows , np .uint8 )
142
+ df = pd .DataFrame (d , columns = columns )
152
143
153
144
predictions = loaded_model .predict_proba (df )
154
145
0 commit comments