Skip to content

Commit 9d54b1a

Browse files
perf: speedup fasta reader
Here I restructured `FastaReader::read()` function to avoid extra allocations. This is basically a complete rewrite. This involved changing error handling, so I thought while we are at it, why not improve error messages. While working on the new algo, I also discovered a few important cases missing from tests, so I added them. Speedup on mpox-500 run: ~29% Command: ``` $ cargo -q build --release --target-dir=/workdir/.build/docker --bin=treetime $ hyperfine --warmup 1 --show-output '/workdir/.build/docker/release/treetime ancestral --method-anc=parsimony --dense=false --tree=data/mpox/clade-ii/500/tree.nwk --outdir=tmp/smoke-tests/ancestral/marginal/mpox/clade-ii/500 data/mpox/clade-ii/500/aln.fasta.xz' ``` Before (branch: rust, commit f676392): ``` Time (mean ± σ): 1.866 s ± 0.113 s [User: 4.547 s, System: 0.576 s] Range (min … max): 1.768 s … 2.041 s 10 runs ``` After: ``` Time (mean ± σ): 1.444 s ± 0.111 s [User: 4.181 s, System: 0.527 s] Range (min … max): 1.341 s … 1.650 s 10 runs ``` Profiler shows that `treetime::io::fasta::FastaReader::read()` now takes ~18% of the program, down from 25% (in single-threaded mode).
1 parent f676392 commit 9d54b1a

File tree

2 files changed

+79
-78
lines changed

2 files changed

+79
-78
lines changed

packages/treetime/src/io/fasta.rs

Lines changed: 75 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ use crate::io::compression::Decompressor;
33
use crate::io::concat::Concat;
44
use crate::io::file::{create_file_or_stdout, open_file_or_stdin, open_stdin};
55
use crate::make_error;
6-
use eyre::Report;
7-
use log::info;
6+
use crate::utils::string::quote_single;
7+
use eyre::{Context, Report};
8+
use itertools::Itertools;
9+
use log::{info, warn};
810
use serde::{Deserialize, Serialize};
911
use std::io::{BufRead, BufReader};
1012
use std::path::Path;
@@ -33,12 +35,21 @@ impl FastaRecord {
3335
pub fn is_empty(&self) -> bool {
3436
self.seq_name.is_empty() && self.seq_name.is_empty() && self.desc.is_none() && self.index == 0
3537
}
38+
39+
pub fn header(&self) -> String {
40+
match &self.desc {
41+
Some(desc) => format!(">{} {}", self.seq_name, desc),
42+
None => format!(">{}", self.seq_name),
43+
}
44+
}
3645
}
3746

3847
pub struct FastaReader<'a, 'b> {
3948
reader: Box<dyn BufRead + 'a>,
4049
alphabet: &'b Alphabet,
4150
line: String,
51+
n_lines: usize,
52+
n_chars: usize,
4253
index: usize,
4354
}
4455

@@ -48,6 +59,8 @@ impl<'a, 'b> FastaReader<'a, 'b> {
4859
reader,
4960
alphabet,
5061
line: String::new(),
62+
n_lines: 0,
63+
n_chars: 0,
5164
index: 0,
5265
}
5366
}
@@ -94,100 +107,71 @@ impl<'a, 'b> FastaReader<'a, 'b> {
94107
record.clear();
95108

96109
if self.line.is_empty() {
110+
// Read lines until we find the next record or EOF
97111
loop {
98112
self.line.clear();
113+
if self.reader.read_line(&mut self.line)? == 0 {
114+
if self.index > 0 {
115+
// We have read at least one record by the end of input - this is normal operation mode
116+
return Ok(());
117+
}
118+
119+
if self.index == 0 && self.n_chars == 0 {
120+
// We have read no records and no non-whitespace characters by the end of input
121+
warn!("FASTA input is empty or consists entirely from whitespace: this is allowed but might not be what's intended");
122+
return Ok(());
123+
}
99124

100-
let n_bytes = self.reader.read_line(&mut self.line)?;
101-
if n_bytes == 0 {
102-
return Ok(());
125+
// We have read some characters, but no records detected by the end of input
126+
return make_error!(
127+
"FASTA input is incorrectly formatted: expected at least one FASTA record starting with character '>', but none found"
128+
);
103129
}
104130

105131
let trimmed = self.line.trim();
106-
if !trimmed.is_empty() {
107-
self.line = trimmed.to_owned();
108-
break;
132+
self.n_lines += 1;
133+
self.n_chars += trimmed.len();
134+
if trimmed.starts_with('>') {
135+
break; // Found the header of the next record
109136
}
110137
}
111138
}
112139

113-
if !self.line.starts_with('>') {
114-
return make_error!("Expected character '>' at record start.");
115-
}
116-
117-
let s = self.line[1..].trim();
118-
119-
let mut parts = s.splitn(2, ' ');
120-
let name = parts.next().unwrap_or_default().to_owned();
121-
let desc = parts.next().map(ToOwned::to_owned);
122-
123-
record.seq_name = name;
124-
record.desc = desc;
125-
126-
loop {
127-
self.line.clear();
128-
129-
let n_bytes = self.reader.read_line(&mut self.line)?;
130-
if n_bytes == 0 {
131-
record.index = self.index;
132-
self.index += 1;
133-
return Ok(());
134-
}
140+
let header_line = self.line.trim();
141+
let (name, desc) = header_line[1..].split_once(' ').unwrap_or((&header_line[1..], ""));
142+
record.seq_name = name.to_owned();
143+
record.desc = if desc.is_empty() { None } else { Some(desc.to_owned()) };
144+
record.index = self.index;
145+
self.index += 1;
135146

147+
// Read sequence lines until the next header or EOF
148+
self.line.clear();
149+
while self.reader.read_line(&mut self.line)? > 0 {
136150
let trimmed = self.line.trim();
137-
if !trimmed.is_empty() {
138-
self.line = trimmed.to_owned();
151+
self.n_lines += 1;
152+
self.n_chars += trimmed.len();
153+
if trimmed.starts_with('>') {
154+
// We have reached the next record
139155
break;
140156
}
141-
}
142-
143-
if self.line.is_empty() || self.line.starts_with('>') {
144-
record.index = self.index;
145-
self.index += 1;
146-
return Ok(());
147-
}
148157

149-
let fragment = self
150-
.line
151-
.chars()
152-
.map(|c| {
153-
let c = c.to_ascii_uppercase();
154-
if !self.alphabet.contains(c) {
155-
make_error!("Character is not in the alphabet: '{c}'")
158+
record.seq.reserve(trimmed.len());
159+
for c in trimmed.chars() {
160+
let uc = c.to_ascii_uppercase();
161+
if self.alphabet.contains(uc) {
162+
record.seq.push(uc);
156163
} else {
157-
Ok(c)
164+
return make_error!(
165+
"FASTA input is incorrect: character \"{c}\" is not in the alphabet. Expected characters: {}",
166+
self.alphabet.chars().map(quote_single).join(", ")
167+
)
168+
.wrap_err_with(|| format!("When processing sequence #{}: \"{}\"", self.index, record.header()));
158169
}
159-
})
160-
.collect::<Result<Vec<char>, Report>>()?;
161-
162-
record.seq.extend(&fragment);
163-
164-
loop {
165-
self.line.clear();
166-
self.reader.read_line(&mut self.line)?;
167-
self.line = self.line.trim().to_owned();
168-
if self.line.is_empty() || self.line.starts_with('>') {
169-
break;
170170
}
171171

172-
let fragment = self
173-
.line
174-
.chars()
175-
.map(|c| {
176-
let c = c.to_ascii_uppercase();
177-
if !self.alphabet.contains(c) {
178-
make_error!("Character is not in the alphabet: '{c}'")
179-
} else {
180-
Ok(c)
181-
}
182-
})
183-
.collect::<Result<Vec<char>, Report>>()?;
184-
185-
record.seq.extend(&fragment);
172+
self.line.clear();
186173
}
187174

188-
record.index = self.index;
189-
self.index += 1;
190-
191175
Ok(())
192176
}
193177
}
@@ -294,6 +278,7 @@ mod tests {
294278
use super::*;
295279
use crate::alphabet::alphabet::AlphabetName;
296280
use crate::o;
281+
use crate::utils::error::report_to_string;
297282
use indoc::indoc;
298283
use lazy_static::lazy_static;
299284
use pretty_assertions::assert_eq;
@@ -312,10 +297,20 @@ mod tests {
312297
let mut record = FastaRecord::new();
313298
assert_eq!(
314299
reader.read(&mut record).unwrap_err().to_string(),
315-
"Expected character '>' at record start."
300+
"FASTA input is incorrectly formatted: expected at least one FASTA record starting with character '>', but none found"
316301
);
317302
}
318303

304+
#[test]
305+
fn test_fasta_reader_fail_on_unknown_char() {
306+
let data = b">seq%1\nACGT%ACGT\n";
307+
let mut reader = FastaReader::new(Box::new(Cursor::new(data)), &NUC_ALPHABET);
308+
let mut record = FastaRecord::new();
309+
let actual = report_to_string(&reader.read(&mut record).unwrap_err());
310+
let expected = r#"When processing sequence #1: ">seq%1": FASTA input is incorrect: character "%" is not in the alphabet. Expected characters: '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'"#;
311+
assert_eq!(expected, actual);
312+
}
313+
319314
#[test]
320315
fn test_fasta_reader_read_empty() {
321316
let data = b"";
@@ -568,6 +563,8 @@ mod tests {
568563
ACGT
569564
>Identifier Description with spaces
570565
ACGT
566+
567+
571568
"#},
572569
&NUC_ALPHABET,
573570
)?;
@@ -646,8 +643,8 @@ mod tests {
646643
index: 4,
647644
},
648645
FastaRecord {
649-
seq_name: o!("SneezeC-19"),
650-
desc: None,
646+
seq_name: o!(""),
647+
desc: Some(o!("SneezeC-19")),
651648
seq: o!("CCGGCGATGTRTTG--"),
652649
index: 5,
653650
},

packages/treetime/src/utils/string.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,7 @@ pub fn vec_to_string(v: Vec<char>) -> String {
1616
pub fn quote(x: impl Display) -> String {
1717
format!("\"{x}\"")
1818
}
19+
20+
pub fn quote_single(x: impl Display) -> String {
21+
format!("'{x}'")
22+
}

0 commit comments

Comments
 (0)