Skip to content

Commit 7854de9

Browse files
committed
Shave off another second or so
1 parent 33c8d03 commit 7854de9

File tree

1 file changed

+3
-5
lines changed

1 file changed

+3
-5
lines changed

pangolin/scripts/pangolearn.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
116116
# possible nucleotide symbols
117117
categories = ['-','A', 'C', 'G', 'T']
118118
columns = [f"{i}_{c}" for i in indiciesToKeep for c in categories]
119-
refRow = {f"{i}_{c}": 1 for i,c in zip(indiciesToKeep, encodeSeq(referenceSeq, indiciesToKeep))}
119+
refRow = [r==c for r in encodeSeq(referenceSeq, indiciesToKeep) for c in categories]
120120

121121
print("loading model " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S"))
122122
loaded_model = joblib.load(args.model_file)
@@ -129,17 +129,15 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000):
129129
len(seqList), datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
130130
))
131131

132-
rows = [{f"{i}_{c}": 1 for i,c in zip(indiciesToKeep, row)} for row in seqList]
132+
rows = [[r==c for r in row for c in categories] for row in seqList]
133133
# the reference seq must be added to every block to make sure that the
134134
# spots in the reference that have Ns are in the dataframe to guarantee that
135135
# the correct number of columns is created when get_dummies is called
136136
rows.append(refRow)
137137
idList.append(referenceId)
138138

139139
# create a data frame from the seqList
140-
df = pd.DataFrame.from_records(rows, columns=columns)
141-
df.fillna(0, inplace=True)
142-
df = df.astype('uint8')
140+
df = pd.DataFrame(rows, columns=columns).astype('uint8')
143141

144142
predictions = loaded_model.predict_proba(df)
145143

0 commit comments

Comments
 (0)