From 7854de92ce6212696f2b7b8f373523e87cc603f0 Mon Sep 17 00:00:00 2001 From: Ben Taylor Date: Thu, 4 Feb 2021 19:46:54 +0100 Subject: [PATCH] Shave off another second or so --- pangolin/scripts/pangolearn.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pangolin/scripts/pangolearn.py b/pangolin/scripts/pangolearn.py index 5485980..91f577d 100644 --- a/pangolin/scripts/pangolearn.py +++ b/pangolin/scripts/pangolearn.py @@ -116,7 +116,7 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000): # possible nucleotide symbols categories = ['-','A', 'C', 'G', 'T'] columns = [f"{i}_{c}" for i in indiciesToKeep for c in categories] -refRow = {f"{i}_{c}": 1 for i,c in zip(indiciesToKeep, encodeSeq(referenceSeq, indiciesToKeep))} +refRow = [r==c for r in encodeSeq(referenceSeq, indiciesToKeep) for c in categories] print("loading model " + datetime.now().strftime("%m/%d/%Y, %H:%M:%S")) loaded_model = joblib.load(args.model_file) @@ -129,7 +129,7 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000): len(seqList), datetime.now().strftime("%m/%d/%Y, %H:%M:%S") )) - rows = [{f"{i}_{c}": 1 for i,c in zip(indiciesToKeep, row)} for row in seqList] + rows = [[r==c for r in row for c in categories] for row in seqList] # the reference seq must be added to everry block to make sure that the # spots in the reference have Ns are in the dataframe to guarentee that # the correct number of columns is created when get_dummies is called @@ -137,9 +137,7 @@ def readInAndFormatData(sequencesFile, indiciesToKeep, blockSize=1000): idList.append(referenceId) # create a data from from the seqList - df = pd.DataFrame.from_records(rows, columns=columns) - df.fillna(0, inplace=True) - df = df.astype('uint8') + df = pd.DataFrame(rows, columns=columns).astype('uint8') predictions = loaded_model.predict_proba(df)