From 85cee56338484e5dd37ba3b879d8d32f24634991 Mon Sep 17 00:00:00 2001 From: danielgeiszler Date: Sat, 13 Apr 2024 11:01:23 +0300 Subject: [PATCH] Complete lazy computation for PSMFile class, speeds up parsing by avoiding unnecessary string splitting --- .../umich/andykong/ptmshepherd/PSMFile.java | 66 +++++++++++++++++-- .../IterativeLocalizer.java | 45 +++++++++++-- 2 files changed, 101 insertions(+), 10 deletions(-) diff --git a/src/edu/umich/andykong/ptmshepherd/PSMFile.java b/src/edu/umich/andykong/ptmshepherd/PSMFile.java index 3721a5e..46a1d8b 100644 --- a/src/edu/umich/andykong/ptmshepherd/PSMFile.java +++ b/src/edu/umich/andykong/ptmshepherd/PSMFile.java @@ -71,7 +71,8 @@ public class PSM { PSM(int lineNum, String line) { this.lineNum = lineNum; - this.spLine = new ArrayList<>(StringParsingUtils.splitStringByTab(line.replace("\n",""))); + this.spLine = new ArrayList<>(StringParsingUtils.splitStringByTab( + line.replace("\n",""))); this.fileName = null; this.spec = null; this.specNum = -1; @@ -81,29 +82,58 @@ public class PSM { this.dMass = null; } + PSM(int lineNum) { + this.lineNum = lineNum; + this.spLine = null; + this.fileName = null; + this.spec = null; + this.specNum = -1; + this.pep = null; + this.mods = null; + this.modArr = null; + this.dMass = null; + } + + private void calculateSpLine() { + this.spLine = new ArrayList<>(StringParsingUtils.splitStringByTab( + data.get(this.lineNum).replace("\n",""))); + } + public void addValAtColumn(int colIdx, String val){ + if (this.spLine == null) + this.calculateSpLine(); this.spLine.add(colIdx, val); } public void addValAtColumn(String colName, String val){ + if (this.spLine == null) + this.calculateSpLine(); this.spLine.add(getColumn(colName), val); } public void replaceValAtColumn(int colIdx, String val){ + if (this.spLine == null) + this.calculateSpLine(); this.spLine.set(colIdx, val); } public void replaceValAtColumn(String colName, String val){ + if (this.spLine == null) + this.calculateSpLine(); this.spLine.set(getColumn(colName), val); } public String getSpec() { + if (this.spLine == null) + this.calculateSpLine(); if (this.spec == null) this.spec = reNormName(spLine.get(getColumn("Spectrum"))); return this.spec; } public int getSpecNum() { + if (this.spLine == null) + this.calculateSpLine(); if (this.specNum == -1) { if (this.spec == null) { this.getSpec(); @@ -115,18 +145,24 @@ public int getSpecNum() { } public String getFileName() { + if (this.spLine == null) + this.calculateSpLine(); if (this.fileName == null) this.fileName = this.getSpec().substring(0, this.getSpec().indexOf(".")); return this.fileName; } public String getPep() { + if (this.spLine == null) + this.calculateSpLine(); if (this.pep == null) this.pep = spLine.get(getColumn("Peptide")); return this.pep; } public ArrayList> getMods() { + if (this.spLine == null) + this.calculateSpLine(); if (this.mods == null) { this.mods = new ArrayList<>(); String strMods = spLine.get(getColumn("Assigned Modifications")); @@ -152,6 +188,8 @@ else if (spos.equals("c")) } public float [] getModsAsArray() { + if (this.spLine == null) + this.calculateSpLine(); if (this.modArr == null) { this.modArr = new float[this.getPep().length()]; Arrays.fill(this.modArr, 0.0f); @@ -166,15 +204,25 @@ else if (spos.equals("c")) } public float getDMass() { + if (this.spLine == null) + this.calculateSpLine(); if (this.dMass == null) this.dMass = Float.parseFloat(this.spLine.get(getColumn("Delta Mass"))); return this.dMass; } public String getColumnValue(String colName) { + if (this.spLine == null) + this.calculateSpLine(); return this.spLine.get(getColumn(colName)); } + public ArrayList getSpLine() { + if (this.spLine == null) + this.calculateSpLine(); + return this.spLine; + } + public String toString() { String newStr = String.join("\t", this.spLine); return newStr; @@ -223,7 +271,7 @@ public static String getCRC32(File f) throws Exception { * @return PSM */ private PSM getRawLine(int i) { - PSM psm = new PSM(i, data.get(i)); + PSM psm = new PSM(i); return psm; } @@ -809,15 +857,19 @@ public PSMFile(File f) throws Exception { } in.close(); - // Build index of PSM file lines to spectrum names and pre parse spec names - this.scanToLine = new HashMap<>(); + // Prefill PSM array with dummy variables so they can be constructed upon being called for (int i = 0; i < this.data.size(); i++) { PSM tPSM = this.getRawLine(i); psms.add(tPSM); - this.scanToLine.put(tPSM.getSpec(), i); } } + private void constructScanToLineMap() { + this.scanToLine = new HashMap<>(); + for (int i = 0; i < this.psms.size(); i++) + this.scanToLine.put(this.psms.get(i).getSpec(), i); + } + public void annotateMassDiffs(String [] annotations) throws IOException { /* find column to modify, overwrite Observed Modifications col if exists */ int annoCol = getColumn("Observed Modifications"); @@ -936,6 +988,10 @@ public void addColumn(int colIndx, String newHead, ArrayList keys, Array "editing PSM table will fail\n"); } + // Check that the scan to line mapping has been constructed + if (this.scanToLine == null) + constructScanToLineMap(); + //TODO if column not found and inserting to the right, it will insert at the beginning of the table // This fixes it, but the error message should be different. Not sure how to check colIndx if (colIndx == 0) diff --git a/src/edu/umich/andykong/ptmshepherd/iterativelocalization/IterativeLocalizer.java b/src/edu/umich/andykong/ptmshepherd/iterativelocalization/IterativeLocalizer.java index ed8180e..155e9f3 100644 --- a/src/edu/umich/andykong/ptmshepherd/iterativelocalization/IterativeLocalizer.java +++ b/src/edu/umich/andykong/ptmshepherd/iterativelocalization/IterativeLocalizer.java @@ -43,6 +43,8 @@ public class IterativeLocalizer { static int scanNum; Map localizationLikelihoodMap; + Map>> psmToRunToLine; + Map> convergedPsmsMap; static boolean debugFlag; boolean printIonDistribution = true; // TODO make this a parameter boolean poissonBinomialDistribution = true; // TODO make this a parameter @@ -197,6 +199,12 @@ private void calculateLocalizationProbabilities() throws Exception { // Set up likelihood store so that values can be accessed without recomputing this.localizationLikelihoodMap = new HashMap<>(); + // Set up converged scans map so that PSMs are reprocessed if not necessary + this.convergedPsmsMap = new HashMap<>(); + + // Set up run -> line mapping so that the whole PSM table doesn't get parsed every iteration + this.psmToRunToLine = new HashMap<>(); + // Faster access to zero bin spectra to be ignored double zbL = this.peaks[1][this.zeroBin]; // TODO: set up custom bounds? double zbR = this.peaks[2][this.zeroBin]; // TODO: set up custom bounds? @@ -216,8 +224,18 @@ private void calculateLocalizationProbabilities() throws Exception { // Loop through PSM files for (int i = 0; i < dsData.size(); i++) { - PSMFile psmf = new PSMFile(new File(dsData.get(i)[0])); - HashMap> runToLine = psmf.getRunMappings(); + String psmfStr = dsData.get(i)[0]; + PSMFile psmf = new PSMFile(psmfStr); + + // Get run to line mappings, if first run calculate, else get preprocessed list to prevent extra parsing + HashMap> runToLine; + if (epoch == 1) {//todo simplify logic + runToLine = psmf.getRunMappings(); //TODO this is what's parsing it every time + this.psmToRunToLine.put(psmfStr, runToLine); + } else + runToLine = this.psmToRunToLine.get(psmfStr); + + // These hold the output to insert into the PSM table //todo only need to be declared on final run ArrayList specNames = new ArrayList<>(); ArrayList strOutputProbs = new ArrayList<>(); ArrayList strMaxProbs = new ArrayList<>(); @@ -237,6 +255,15 @@ private void calculateLocalizationProbabilities() throws Exception { this.scanNum = -1; debugFlag = false; + // First, check if we can skip this line because the bin is converged + if (!finalPass) { + if (this.convergedPsmsMap.containsKey(cf)) { // check if run is in map + if (this.convergedPsmsMap.get(cf).contains(j)) { // check if scan is in converged bin + continue; + } + } + } + PSMFile.PSM psm = psmf.getLine(j); float dMass = psm.getDMass(); String pep = psm.getPep(); @@ -268,9 +295,18 @@ private void calculateLocalizationProbabilities() throws Exception { continue; } - // If converged skip unless final round - if (!finalPass && this.priorProbs[cBin].getIsConverged()) // Safe because left is first + // If converged, skip. Unless final round, then calculate final values. + // If converged and not in converged map, add to converged map. + if (!finalPass && this.priorProbs[cBin].getIsConverged()) { // safe because left is first + + if (!finalPass) { + if (!this.convergedPsmsMap.containsKey(cf)) // check if run is in map + this.convergedPsmsMap.put(cf, new HashSet<>(runToLine.get(cf).size()+1,1.0f)); + if (!this.convergedPsmsMap.get(cf).contains(j)) // check if scan is in converged bin + this.convergedPsmsMap.get(cf).add(j); // add scan to list of converged scans + } continue; + } // todo this logic is getting way too complex, need to handle execution states in a static context Spectrum spec = null; @@ -1139,5 +1175,4 @@ private double safeDivide(int x, int y) { else return (double) x / (double) y; } - } \ No newline at end of file