@@ -3,8 +3,10 @@ use crate::io::compression::Decompressor;
3
3
use crate :: io:: concat:: Concat ;
4
4
use crate :: io:: file:: { create_file_or_stdout, open_file_or_stdin, open_stdin} ;
5
5
use crate :: make_error;
6
- use eyre:: Report ;
7
- use log:: info;
6
+ use crate :: utils:: string:: quote_single;
7
+ use eyre:: { Context , Report } ;
8
+ use itertools:: Itertools ;
9
+ use log:: { info, warn} ;
8
10
use serde:: { Deserialize , Serialize } ;
9
11
use std:: io:: { BufRead , BufReader } ;
10
12
use std:: path:: Path ;
@@ -33,12 +35,21 @@ impl FastaRecord {
33
35
pub fn is_empty ( & self ) -> bool {
34
36
self . seq_name . is_empty ( ) && self . seq_name . is_empty ( ) && self . desc . is_none ( ) && self . index == 0
35
37
}
38
+
39
+ pub fn header ( & self ) -> String {
40
+ match & self . desc {
41
+ Some ( desc) => format ! ( ">{} {}" , self . seq_name, desc) ,
42
+ None => format ! ( ">{}" , self . seq_name) ,
43
+ }
44
+ }
36
45
}
37
46
38
47
pub struct FastaReader < ' a , ' b > {
39
48
reader : Box < dyn BufRead + ' a > ,
40
49
alphabet : & ' b Alphabet ,
41
50
line : String ,
51
+ n_lines : usize ,
52
+ n_chars : usize ,
42
53
index : usize ,
43
54
}
44
55
@@ -48,6 +59,8 @@ impl<'a, 'b> FastaReader<'a, 'b> {
48
59
reader,
49
60
alphabet,
50
61
line : String :: new ( ) ,
62
+ n_lines : 0 ,
63
+ n_chars : 0 ,
51
64
index : 0 ,
52
65
}
53
66
}
@@ -94,100 +107,71 @@ impl<'a, 'b> FastaReader<'a, 'b> {
94
107
record. clear ( ) ;
95
108
96
109
if self . line . is_empty ( ) {
110
+ // Read lines until we find the next record or EOF
97
111
loop {
98
112
self . line . clear ( ) ;
113
+ if self . reader . read_line ( & mut self . line ) ? == 0 {
114
+ if self . index > 0 {
115
+ // We have read at least one record by the end of input - this is normal operation mode
116
+ return Ok ( ( ) ) ;
117
+ }
118
+
119
+ if self . index == 0 && self . n_chars == 0 {
120
+ // We have read no records and no non-whitespace characters by the end of input
121
+ warn ! ( "FASTA input is empty or consists entirely from whitespace: this is allowed but might not be what's intended" ) ;
122
+ return Ok ( ( ) ) ;
123
+ }
99
124
100
- let n_bytes = self . reader . read_line ( & mut self . line ) ?;
101
- if n_bytes == 0 {
102
- return Ok ( ( ) ) ;
125
+ // We have read some characters, but no records detected by the end of input
126
+ return make_error ! (
127
+ "FASTA input is incorrectly formatted: expected at least one FASTA record starting with character '>', but none found"
128
+ ) ;
103
129
}
104
130
105
131
let trimmed = self . line . trim ( ) ;
106
- if !trimmed. is_empty ( ) {
107
- self . line = trimmed. to_owned ( ) ;
108
- break ;
132
+ self . n_lines += 1 ;
133
+ self . n_chars += trimmed. len ( ) ;
134
+ if trimmed. starts_with ( '>' ) {
135
+ break ; // Found the header of the next record
109
136
}
110
137
}
111
138
}
112
139
113
- if !self . line . starts_with ( '>' ) {
114
- return make_error ! ( "Expected character '>' at record start." ) ;
115
- }
116
-
117
- let s = self . line [ 1 ..] . trim ( ) ;
118
-
119
- let mut parts = s. splitn ( 2 , ' ' ) ;
120
- let name = parts. next ( ) . unwrap_or_default ( ) . to_owned ( ) ;
121
- let desc = parts. next ( ) . map ( ToOwned :: to_owned) ;
122
-
123
- record. seq_name = name;
124
- record. desc = desc;
125
-
126
- loop {
127
- self . line . clear ( ) ;
128
-
129
- let n_bytes = self . reader . read_line ( & mut self . line ) ?;
130
- if n_bytes == 0 {
131
- record. index = self . index ;
132
- self . index += 1 ;
133
- return Ok ( ( ) ) ;
134
- }
140
+ let header_line = self . line . trim ( ) ;
141
+ let ( name, desc) = header_line[ 1 ..] . split_once ( ' ' ) . unwrap_or ( ( & header_line[ 1 ..] , "" ) ) ;
142
+ record. seq_name = name. to_owned ( ) ;
143
+ record. desc = if desc. is_empty ( ) { None } else { Some ( desc. to_owned ( ) ) } ;
144
+ record. index = self . index ;
145
+ self . index += 1 ;
135
146
147
+ // Read sequence lines until the next header or EOF
148
+ self . line . clear ( ) ;
149
+ while self . reader . read_line ( & mut self . line ) ? > 0 {
136
150
let trimmed = self . line . trim ( ) ;
137
- if !trimmed. is_empty ( ) {
138
- self . line = trimmed. to_owned ( ) ;
151
+ self . n_lines += 1 ;
152
+ self . n_chars += trimmed. len ( ) ;
153
+ if trimmed. starts_with ( '>' ) {
154
+ // We have reached the next record
139
155
break ;
140
156
}
141
- }
142
-
143
- if self . line . is_empty ( ) || self . line . starts_with ( '>' ) {
144
- record. index = self . index ;
145
- self . index += 1 ;
146
- return Ok ( ( ) ) ;
147
- }
148
157
149
- let fragment = self
150
- . line
151
- . chars ( )
152
- . map ( |c| {
153
- let c = c. to_ascii_uppercase ( ) ;
154
- if !self . alphabet . contains ( c) {
155
- make_error ! ( "Character is not in the alphabet: '{c}'" )
158
+ record. seq . reserve ( trimmed. len ( ) ) ;
159
+ for c in trimmed. chars ( ) {
160
+ let uc = c. to_ascii_uppercase ( ) ;
161
+ if self . alphabet . contains ( uc) {
162
+ record. seq . push ( uc) ;
156
163
} else {
157
- Ok ( c)
164
+ return make_error ! (
165
+ "FASTA input is incorrect: character \" {c}\" is not in the alphabet. Expected characters: {}" ,
166
+ self . alphabet. chars( ) . map( quote_single) . join( ", " )
167
+ )
168
+ . wrap_err_with ( || format ! ( "When processing sequence #{}: \" {}\" " , self . index, record. header( ) ) ) ;
158
169
}
159
- } )
160
- . collect :: < Result < Vec < char > , Report > > ( ) ?;
161
-
162
- record. seq . extend ( & fragment) ;
163
-
164
- loop {
165
- self . line . clear ( ) ;
166
- self . reader . read_line ( & mut self . line ) ?;
167
- self . line = self . line . trim ( ) . to_owned ( ) ;
168
- if self . line . is_empty ( ) || self . line . starts_with ( '>' ) {
169
- break ;
170
170
}
171
171
172
- let fragment = self
173
- . line
174
- . chars ( )
175
- . map ( |c| {
176
- let c = c. to_ascii_uppercase ( ) ;
177
- if !self . alphabet . contains ( c) {
178
- make_error ! ( "Character is not in the alphabet: '{c}'" )
179
- } else {
180
- Ok ( c)
181
- }
182
- } )
183
- . collect :: < Result < Vec < char > , Report > > ( ) ?;
184
-
185
- record. seq . extend ( & fragment) ;
172
+ self . line . clear ( ) ;
186
173
}
187
174
188
- record. index = self . index ;
189
- self . index += 1 ;
190
-
191
175
Ok ( ( ) )
192
176
}
193
177
}
@@ -294,6 +278,7 @@ mod tests {
294
278
use super :: * ;
295
279
use crate :: alphabet:: alphabet:: AlphabetName ;
296
280
use crate :: o;
281
+ use crate :: utils:: error:: report_to_string;
297
282
use indoc:: indoc;
298
283
use lazy_static:: lazy_static;
299
284
use pretty_assertions:: assert_eq;
@@ -312,10 +297,20 @@ mod tests {
312
297
let mut record = FastaRecord :: new ( ) ;
313
298
assert_eq ! (
314
299
reader. read( & mut record) . unwrap_err( ) . to_string( ) ,
315
- "Expected character '>' at record start. "
300
+ "FASTA input is incorrectly formatted: expected at least one FASTA record starting with character '>', but none found "
316
301
) ;
317
302
}
318
303
304
+ #[ test]
305
+ fn test_fasta_reader_fail_on_unknown_char ( ) {
306
+ let data = b">seq%1\n ACGT%ACGT\n " ;
307
+ let mut reader = FastaReader :: new ( Box :: new ( Cursor :: new ( data) ) , & NUC_ALPHABET ) ;
308
+ let mut record = FastaRecord :: new ( ) ;
309
+ let actual = report_to_string ( & reader. read ( & mut record) . unwrap_err ( ) ) ;
310
+ let expected = r#"When processing sequence #1: ">seq%1": FASTA input is incorrect: character "%" is not in the alphabet. Expected characters: '-', 'A', 'B', 'C', 'D', 'G', 'H', 'K', 'M', 'N', 'R', 'S', 'T', 'V', 'W', 'Y'"# ;
311
+ assert_eq ! ( expected, actual) ;
312
+ }
313
+
319
314
#[ test]
320
315
fn test_fasta_reader_read_empty ( ) {
321
316
let data = b"" ;
@@ -568,6 +563,8 @@ mod tests {
568
563
ACGT
569
564
>Identifier Description with spaces
570
565
ACGT
566
+
567
+
571
568
"# } ,
572
569
& NUC_ALPHABET ,
573
570
) ?;
@@ -646,8 +643,8 @@ mod tests {
646
643
index: 4 ,
647
644
} ,
648
645
FastaRecord {
649
- seq_name: o!( "SneezeC-19 " ) ,
650
- desc: None ,
646
+ seq_name: o!( "" ) ,
647
+ desc: Some ( o! ( "SneezeC-19" ) ) ,
651
648
seq: o!( "CCGGCGATGTRTTG--" ) ,
652
649
index: 5 ,
653
650
} ,
0 commit comments