1
1
use crate :: io:: json:: { json_write_str, JsonPretty } ;
2
2
use crate :: representation:: bitset128:: BitSet128 ;
3
+ use crate :: representation:: seq_char:: AsciiChar ;
3
4
use crate :: representation:: state_set:: StateSet ;
4
5
use crate :: utils:: string:: quote;
5
6
use crate :: { make_error, stateset, vec_u8} ;
@@ -16,9 +17,9 @@ use std::fmt::Display;
16
17
use std:: iter:: once;
17
18
use strum_macros:: Display ;
18
19
19
- pub const NON_CHAR : u8 = b'.' ;
20
- pub const VARIABLE_CHAR : u8 = b'~' ;
21
- pub const FILL_CHAR : u8 = b' ' ;
20
+ pub const NON_CHAR : AsciiChar = AsciiChar ( b'.' ) ;
21
+ pub const VARIABLE_CHAR : AsciiChar = AsciiChar ( b'~' ) ;
22
+ pub const FILL_CHAR : AsciiChar = AsciiChar ( b' ' ) ;
22
23
23
24
#[ derive( Copy , Clone , Debug , PartialEq , Eq , PartialOrd , Ord , ArgEnum , SmartDefault , Display ) ]
24
25
#[ clap( rename = "kebab-case" ) ]
@@ -28,33 +29,33 @@ pub enum AlphabetName {
28
29
Aa ,
29
30
}
30
31
31
- pub type ProfileMap = IndexMap < u8 , Array1 < f64 > > ;
32
- pub type StateSetMap = IndexMap < u8 , StateSet > ;
33
- pub type CharToSet = IndexMap < u8 , StateSet > ;
34
- pub type SetToChar = IndexMap < StateSet , u8 > ;
32
+ pub type ProfileMap = IndexMap < AsciiChar , Array1 < f64 > > ;
33
+ pub type StateSetMap = IndexMap < AsciiChar , StateSet > ;
34
+ pub type CharToSet = IndexMap < AsciiChar , StateSet > ;
35
+ pub type SetToChar = IndexMap < StateSet , AsciiChar > ;
35
36
36
37
#[ derive( Clone , Debug , Serialize , Deserialize ) ]
37
38
pub struct Alphabet {
38
39
all : StateSet ,
39
40
canonical : StateSet ,
40
- ambiguous : IndexMap < u8 , Vec < u8 > > ,
41
+ ambiguous : IndexMap < AsciiChar , Vec < AsciiChar > > ,
41
42
ambiguous_keys : StateSet ,
42
43
determined : StateSet ,
43
44
undetermined : StateSet ,
44
- unknown : u8 ,
45
- gap : u8 ,
45
+ unknown : AsciiChar ,
46
+ gap : AsciiChar ,
46
47
treat_gap_as_unknown : bool ,
47
48
profile_map : ProfileMap ,
48
49
49
50
#[ serde( skip) ]
50
- char_to_set : IndexMap < u8 , StateSet > ,
51
+ char_to_set : IndexMap < AsciiChar , StateSet > ,
51
52
#[ serde( skip) ]
52
- set_to_char : IndexMap < StateSet , u8 > ,
53
+ set_to_char : IndexMap < StateSet , AsciiChar > ,
53
54
54
55
#[ serde( skip) ]
55
56
char_to_index : Vec < Option < usize > > ,
56
57
#[ serde( skip) ]
57
- index_to_char : Vec < u8 > ,
58
+ index_to_char : Vec < AsciiChar > ,
58
59
}
59
60
60
61
impl Default for Alphabet {
@@ -111,15 +112,21 @@ impl Alphabet {
111
112
treat_gap_as_unknown,
112
113
} = cfg;
113
114
115
+ let gap = AsciiChar :: from ( * gap) ;
116
+ let unknown = AsciiChar :: from ( * unknown) ;
117
+
114
118
let canonical = StateSet :: from_iter ( canonical) ;
115
119
if canonical. is_empty ( ) {
116
120
return make_error ! ( "When creating alphabet: canonical set of characters is empty. This is not allowed." ) ;
117
121
}
118
122
119
- let ambiguous: IndexMap < u8 , Vec < u8 > > = ambiguous. to_owned ( ) ;
123
+ let ambiguous: IndexMap < AsciiChar , Vec < AsciiChar > > = ambiguous
124
+ . iter ( )
125
+ . map ( |( k, v) | ( AsciiChar ( * k) , v. iter ( ) . copied ( ) . map ( AsciiChar ) . collect ( ) ) )
126
+ . collect ( ) ;
120
127
let ambiguous_keys = ambiguous. keys ( ) . collect ( ) ;
121
128
122
- let undetermined = stateset ! { * unknown, * gap} ;
129
+ let undetermined = stateset ! { unknown, gap} ;
123
130
let determined = StateSet :: from_union ( [ canonical, ambiguous_keys] ) ;
124
131
let all = StateSet :: from_union ( [ canonical, ambiguous_keys, undetermined] ) ;
125
132
@@ -128,7 +135,7 @@ impl Alphabet {
128
135
let mut char_to_index = vec ! [ None ; 128 ] ;
129
136
let mut index_to_char = Vec :: with_capacity ( canonical. len ( ) ) ;
130
137
for ( i, c) in canonical. iter ( ) . enumerate ( ) {
131
- char_to_index[ c as usize ] = Some ( i) ;
138
+ char_to_index[ usize:: from ( c ) ] = Some ( i) ;
132
139
index_to_char. push ( c) ;
133
140
}
134
141
@@ -137,8 +144,8 @@ impl Alphabet {
137
144
ambiguous. iter ( ) . for_each ( |( key, chars) | {
138
145
char_to_set. insert ( * key, StateSet :: from_iter ( chars) ) ;
139
146
} ) ;
140
- char_to_set. insert ( * gap, StateSet :: from_char ( * gap) ) ;
141
- char_to_set. insert ( * unknown, StateSet :: from_char ( * unknown) ) ;
147
+ char_to_set. insert ( gap, StateSet :: from_char ( gap) ) ;
148
+ char_to_set. insert ( unknown, StateSet :: from_char ( unknown) ) ;
142
149
char_to_set
143
150
} ;
144
151
@@ -153,8 +160,8 @@ impl Alphabet {
153
160
ambiguous_keys,
154
161
determined,
155
162
undetermined,
156
- unknown : * unknown ,
157
- gap : * gap ,
163
+ unknown,
164
+ gap,
158
165
treat_gap_as_unknown : * treat_gap_as_unknown,
159
166
profile_map,
160
167
char_to_set,
@@ -163,7 +170,7 @@ impl Alphabet {
163
170
}
164
171
165
172
#[ inline]
166
- pub fn get_profile ( & self , c : u8 ) -> & Array1 < f64 > {
173
+ pub fn get_profile ( & self , c : AsciiChar ) -> & Array1 < f64 > {
167
174
self
168
175
. profile_map
169
176
. get ( & c)
@@ -180,7 +187,7 @@ impl Alphabet {
180
187
pub fn construct_profile < I , T > ( & self , chars : I ) -> Result < Array1 < f64 > , Report >
181
188
where
182
189
I : IntoIterator < Item = T > ,
183
- T : Borrow < u8 > + Display ,
190
+ T : Borrow < AsciiChar > + Display ,
184
191
{
185
192
let mut profile = Array1 :: < f64 > :: zeros ( self . n_canonical ( ) ) ;
186
193
for c in chars {
@@ -193,7 +200,7 @@ impl Alphabet {
193
200
Ok ( profile)
194
201
}
195
202
196
- pub fn get_code ( & self , profile : & Array1 < f64 > ) -> u8 {
203
+ pub fn get_code ( & self , profile : & Array1 < f64 > ) -> AsciiChar {
197
204
// TODO(perf): this mapping needs to be precomputed
198
205
self
199
206
. profile_map
@@ -204,39 +211,39 @@ impl Alphabet {
204
211
}
205
212
206
213
#[ allow( single_use_lifetimes) ] // TODO: remove when anonymous lifetimes in `impl Trait` are stabilized
207
- pub fn seq2prof < ' a > ( & self , chars : impl IntoIterator < Item = & ' a u8 > ) -> Result < Array2 < f64 > , Report > {
214
+ pub fn seq2prof < ' a > ( & self , chars : impl IntoIterator < Item = & ' a AsciiChar > ) -> Result < Array2 < f64 > , Report > {
208
215
let prof = stack (
209
216
Axis ( 0 ) ,
210
217
& chars. into_iter ( ) . map ( |& c| self . get_profile ( c) . view ( ) ) . collect_vec ( ) ,
211
218
) ?;
212
219
Ok ( prof)
213
220
}
214
221
215
- pub fn set_to_char ( & self , c : StateSet ) -> u8 {
222
+ pub fn set_to_char ( & self , c : StateSet ) -> AsciiChar {
216
223
self . set_to_char [ & c]
217
224
}
218
225
219
- pub fn char_to_set ( & self , c : u8 ) -> StateSet {
220
- self . char_to_set [ & c]
226
+ pub fn char_to_set ( & self , c : impl Into < AsciiChar > ) -> StateSet {
227
+ self . char_to_set [ & c. into ( ) ]
221
228
}
222
229
223
230
/// All existing characters (including 'unknown' and 'gap')
224
- pub fn chars ( & self ) -> impl Iterator < Item = u8 > + ' _ {
231
+ pub fn chars ( & self ) -> impl Iterator < Item = AsciiChar > + ' _ {
225
232
self . all . iter ( )
226
233
}
227
234
228
235
/// Get canonical character by index (indexed in the same order as given by `.canonical()`)
229
- pub fn char ( & self , index : usize ) -> u8 {
236
+ pub fn char ( & self , index : usize ) -> AsciiChar {
230
237
self . index_to_char [ index]
231
238
}
232
239
233
240
/// Get index of a canonical character (indexed in the same order as given by `.canonical()`). Panics if no index is registered for the character.
234
- pub fn index ( & self , c : u8 ) -> usize {
235
- self . char_to_index [ c as usize ] . unwrap ( )
241
+ pub fn index ( & self , c : impl Into < usize > ) -> usize {
242
+ self . char_to_index [ c. into ( ) ] . unwrap ( )
236
243
}
237
244
238
245
/// Check if character is in alphabet (including 'unknown' and 'gap')
239
- pub fn contains ( & self , c : u8 ) -> bool {
246
+ pub fn contains ( & self , c : AsciiChar ) -> bool {
240
247
self . all . contains ( c)
241
248
}
242
249
@@ -245,12 +252,12 @@ impl Alphabet {
245
252
}
246
253
247
254
/// Canonical (unambiguous) characters (e.g. 'A', 'C', 'G', 'T' in nuc alphabet)
248
- pub fn canonical ( & self ) -> impl Iterator < Item = u8 > + ' _ {
255
+ pub fn canonical ( & self ) -> impl Iterator < Item = AsciiChar > + ' _ {
249
256
self . canonical . iter ( )
250
257
}
251
258
252
259
/// Check if character is canonical
253
- pub fn is_canonical ( & self , c : u8 ) -> bool {
260
+ pub fn is_canonical ( & self , c : AsciiChar ) -> bool {
254
261
self . canonical . contains ( c)
255
262
}
256
263
@@ -259,12 +266,12 @@ impl Alphabet {
259
266
}
260
267
261
268
/// Ambiguous characters (e.g. 'R', 'S' etc. in nuc alphabet)
262
- pub fn ambiguous ( & self ) -> impl Iterator < Item = u8 > + ' _ {
269
+ pub fn ambiguous ( & self ) -> impl Iterator < Item = AsciiChar > + ' _ {
263
270
self . ambiguous_keys . iter ( )
264
271
}
265
272
266
273
/// Check if character is ambiguous (e.g. 'R', 'S' etc. in nuc alphabet)
267
- pub fn is_ambiguous ( & self , c : u8 ) -> bool {
274
+ pub fn is_ambiguous ( & self , c : AsciiChar ) -> bool {
268
275
self . ambiguous_keys . contains ( c)
269
276
}
270
277
@@ -273,11 +280,11 @@ impl Alphabet {
273
280
}
274
281
275
282
/// Determined characters: canonical or ambiguous
276
- pub fn determined ( & self ) -> impl Iterator < Item = u8 > + ' _ {
283
+ pub fn determined ( & self ) -> impl Iterator < Item = AsciiChar > + ' _ {
277
284
self . determined . iter ( )
278
285
}
279
286
280
- pub fn is_determined ( & self , c : u8 ) -> bool {
287
+ pub fn is_determined ( & self , c : AsciiChar ) -> bool {
281
288
self . determined . contains ( c)
282
289
}
283
290
@@ -286,11 +293,11 @@ impl Alphabet {
286
293
}
287
294
288
295
/// Undetermined characters: gap or unknown
289
- pub fn undetermined ( & self ) -> impl Iterator < Item = u8 > + ' _ {
296
+ pub fn undetermined ( & self ) -> impl Iterator < Item = AsciiChar > + ' _ {
290
297
self . undetermined . iter ( )
291
298
}
292
299
293
- pub fn is_undetermined ( & self , c : u8 ) -> bool {
300
+ pub fn is_undetermined ( & self , c : AsciiChar ) -> bool {
294
301
self . undetermined . contains ( c)
295
302
}
296
303
@@ -299,23 +306,23 @@ impl Alphabet {
299
306
}
300
307
301
308
/// Get 'unknown' character
302
- pub fn unknown ( & self ) -> u8 {
309
+ pub fn unknown ( & self ) -> AsciiChar {
303
310
self . unknown
304
311
}
305
312
306
313
/// Check if character is an 'unknown' character
307
- pub fn is_unknown ( & self , c : u8 ) -> bool {
308
- c == self . unknown ( )
314
+ pub fn is_unknown ( & self , c : impl Into < AsciiChar > ) -> bool {
315
+ c. into ( ) == self . unknown ( )
309
316
}
310
317
311
318
/// Get 'gap' character
312
- pub fn gap ( & self ) -> u8 {
319
+ pub fn gap ( & self ) -> AsciiChar {
313
320
self . gap
314
321
}
315
322
316
323
/// Check if character is a gap
317
- pub fn is_gap ( & self , c : u8 ) -> bool {
318
- c == self . gap ( )
324
+ pub fn is_gap ( & self , c : impl Into < AsciiChar > ) -> bool {
325
+ c. into ( ) == self . gap ( )
319
326
}
320
327
}
321
328
@@ -338,6 +345,9 @@ impl AlphabetConfig {
338
345
treat_gap_as_unknown,
339
346
} = self ;
340
347
348
+ let gap = AsciiChar :: from ( * gap) ;
349
+ let unknown = AsciiChar :: from ( * unknown) ;
350
+
341
351
self
342
352
. validate ( )
343
353
. wrap_err ( "When validating alphabet config" )
@@ -353,11 +363,11 @@ impl AlphabetConfig {
353
363
let mut profile_map: ProfileMap = canonical
354
364
. iter ( )
355
365
. zip ( eye. rows ( ) )
356
- . map ( |( s, x) | ( * s , x. to_owned ( ) ) )
366
+ . map ( |( s, x) | ( AsciiChar ( * s ) , x. to_owned ( ) ) )
357
367
. collect ( ) ;
358
368
359
369
// Add unknown to profile map
360
- profile_map. insert ( * unknown, Array1 :: < f64 > :: ones ( canonical. len ( ) ) ) ;
370
+ profile_map. insert ( unknown, Array1 :: < f64 > :: ones ( canonical. len ( ) ) ) ;
361
371
362
372
// Add ambiguous to profile map
363
373
ambiguous. iter ( ) . for_each ( |( & key, values) | {
@@ -366,12 +376,12 @@ impl AlphabetConfig {
366
376
. enumerate ( )
367
377
. map ( |( i, c) | if values. contains ( c) { 1.0 } else { 0.0 } )
368
378
. collect :: < Array1 < f64 > > ( ) ;
369
- profile_map. insert ( key, profile) ;
379
+ profile_map. insert ( AsciiChar ( key) , profile) ;
370
380
} ) ;
371
381
372
382
if * treat_gap_as_unknown {
373
383
// Add gap to profile map
374
- profile_map. insert ( * gap, profile_map[ unknown] . clone ( ) ) ;
384
+ profile_map. insert ( gap, profile_map[ & unknown] . clone ( ) ) ;
375
385
}
376
386
377
387
Ok ( profile_map)
@@ -397,7 +407,7 @@ impl AlphabetConfig {
397
407
. collect_vec ( ) ;
398
408
399
409
for reserved in [ NON_CHAR , VARIABLE_CHAR , FILL_CHAR ] {
400
- if all. iter ( ) . any ( |& c| c == reserved) {
410
+ if all. iter ( ) . any ( |& c| c == u8 :: from ( reserved) ) {
401
411
return make_error ! ( "Alphabet contains reserved character: {reserved}" ) ;
402
412
}
403
413
}
0 commit comments