@@ -116,7 +116,7 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
116
116
# possible nucleotide symbols
117
117
categories = ['-' ,'A' , 'C' , 'G' , 'T' ]
118
118
columns = [f"{ i } _{ c } " for i in indiciesToKeep for c in categories ]
119
- refRow = { f" { i } _ { c } " : 1 for i , c in zip ( indiciesToKeep , encodeSeq (referenceSeq , indiciesToKeep ))}
119
+ refRow = [ r == c for r in encodeSeq (referenceSeq , indiciesToKeep ) for c in categories ]
120
120
121
121
print ("loading model " + datetime .now ().strftime ("%m/%d/%Y, %H:%M:%S" ))
122
122
loaded_model = joblib .load (args .model_file )
@@ -129,17 +129,15 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
129
129
len (seqList ), datetime .now ().strftime ("%m/%d/%Y, %H:%M:%S" )
130
130
))
131
131
132
- rows = [{ f" { i } _ { c } " : 1 for i , c in zip ( indiciesToKeep , row )} for row in seqList ]
132
+ rows = [[ r == c for r in row for c in categories ] for row in seqList ]
133
133
# the reference seq must be added to every block to make sure that the
134
134
# spots where the reference has Ns are in the dataframe to guarantee that
135
135
# the correct number of columns is created when get_dummies is called
136
136
rows .append (refRow )
137
137
idList .append (referenceId )
138
138
139
139
# create a data frame from the seqList
140
- df = pd .DataFrame .from_records (rows , columns = columns )
141
- df .fillna (0 , inplace = True )
142
- df = df .astype ('uint8' )
140
+ df = pd .DataFrame (rows , columns = columns ).astype ('uint8' )
143
141
144
142
predictions = loaded_model .predict_proba (df )
145
143
0 commit comments