11#!/usr/bin/env python3
22
33import pandas as pd
4+ import numpy as np
45from sklearn .model_selection import train_test_split
56from sklearn .linear_model import LogisticRegression
67from sklearn import metrics
@@ -113,6 +114,10 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
113114indiciesToKeep = model_headers [1 :]
114115
115116referenceSeq = findReferenceSeq ()
117+ # possible nucleotide symbols
118+ categories = ['-' ,'A' , 'C' , 'G' , 'T' ]
119+ columns = [f"{ i } _{ c } " for i in indiciesToKeep for c in categories ]
120+ refRow = [r == c for r in encodeSeq (referenceSeq , indiciesToKeep ) for c in categories ]
116121
117122print ("loading model " + datetime .now ().strftime ("%m/%d/%Y, %H:%M:%S" ))
118123loaded_model = joblib .load (args .model_file )
@@ -125,30 +130,16 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
125130 len (seqList ), datetime .now ().strftime ("%m/%d/%Y, %H:%M:%S" )
126131 ))
127132
133+ rows = [[r == c for r in row for c in categories ] for row in seqList ]
128134 # the reference seq must be added to every block to make sure that the
129135 # spots in the reference that have Ns are in the dataframe, to guarantee that
130136 # the correct number of columns is created when get_dummies is called
137+ rows .append (refRow )
131138 idList .append (referenceId )
132- seqList .append (encodeSeq (referenceSeq , indiciesToKeep ))
133139
134140 # create a data frame from the seqList
135- df = pd .DataFrame (seqList , columns = indiciesToKeep )
136-
137- # possible nucleotide symbols
138- categories = ['A' , 'C' , 'G' , 'T' , '-' ]
139-
140- # add extra rows to ensure all of the categories are represented, as otherwise
141- # not enough columns will be created when we call get_dummies
142- extra_rows = [[i ] * len (indiciesToKeep ) for i in categories ]
143- df = pd .concat ([df , pd .DataFrame (extra_rows , columns = indiciesToKeep )], ignore_index = True )
144-
145- # get one-hot encoding
146- df = pd .get_dummies (df , columns = indiciesToKeep )
147-
148- headers = list (df )
149-
150- # get rid of the fake data we just added
151- df .drop (df .tail (len (categories )).index ,inplace = True )
141+ d = np .array (rows , np .uint8 )
142+ df = pd .DataFrame (d , columns = columns )
152143
153144 predictions = loaded_model .predict_proba (df )
154145
0 commit comments