@@ -243,35 +243,56 @@ impl Count {
243
243
min_fraction : f64 ,
244
244
) -> Result < PrefixInfo > {
245
245
let guide_length = library. guide_length ;
246
- let mut prefix_lengths = vec ! [ 0u64 ; 500 ] ;
246
+ let mut prefix_lengths = vec ! [ 0f64 ; 500 ] ;
247
247
let mut count = 0u64 ;
248
248
249
249
// Parse the first `sample_size` records to find exact match guides and
250
250
// extract the sequence that precedes the guide
251
251
parse_path ( Some ( fastq) , |parser| {
252
- parser
253
- . each ( |rec| {
254
- let read_bases = rec. seq ( ) ;
255
- let read_length = read_bases. len ( ) ;
252
+ let mut read_offsets = Vec :: < usize > :: with_capacity ( 5 ) ;
256
253
257
- if read_length >= guide_length {
258
- for trim in 0 ..= ( read_length - guide_length ) {
259
- let bases = & read_bases[ trim..trim + guide_length ] ;
254
+ parser . each ( |rec| {
255
+ let read_bases = rec . seq ( ) ;
256
+ let read_length = read_bases. len ( ) ;
260
257
261
- if lookup. contains_key ( bases) {
262
- prefix_lengths[ trim] += 1 ;
263
- }
258
+ if read_length >= guide_length {
259
+ for trim in 0 ..=( read_length - guide_length) {
260
+ let bases = & read_bases[ trim..trim + guide_length] ;
261
+
262
+ if lookup. contains_key ( bases) {
263
+ read_offsets. push ( trim) ;
264
264
}
265
265
}
266
+ }
267
+
268
+ // If the read only matched at a single offset, just use it; otherwise prefer
269
+ // match(es) that are perfect matches and allocation proportionally.
270
+ if read_offsets. len ( ) == 1 {
271
+ prefix_lengths[ read_offsets[ 0 ] ] += 1.0 ;
272
+ }
273
+ else if read_offsets. len ( ) > 1 {
274
+ let perfect_match_offsets = read_offsets. iter ( ) . copied ( ) . filter ( |& off| {
275
+ let bases = & read_bases[ off..off + guide_length] ;
276
+ let guide = lookup. get ( bases) . unwrap ( ) ;
277
+ guide. bases . as_slice ( ) == bases
278
+ } ) . collect_vec ( ) ;
279
+
280
+ let preferred_offsets = if perfect_match_offsets. is_empty ( ) { & read_offsets } else { & perfect_match_offsets } ;
281
+ let addend = 1f64 / preferred_offsets. len ( ) as f64 ;
282
+ for offset in preferred_offsets. iter ( ) . copied ( ) {
283
+ prefix_lengths[ offset] += addend;
284
+ }
285
+ }
266
286
267
- count += 1 ;
268
- count < sample_size
269
- } )
270
- . expect ( "Failed to parse." ) ;
287
+ read_offsets. clear ( ) ;
288
+ count += 1 ;
289
+ count < sample_size
290
+ } )
291
+ . expect ( "Failed to parse." ) ;
271
292
} )
272
293
. context ( format ! ( "Failed to read {:?}" , fastq) ) ?;
273
294
274
- let total_matched: u64 = prefix_lengths. iter ( ) . sum ( ) ;
295
+ let total_matched: f64 = prefix_lengths. iter ( ) . sum ( ) ;
275
296
let fraction_matched = total_matched as f64 / count as f64 ;
276
297
info ! (
277
298
"In {:?} examined {} reads for guide start position and matched {} ({:.4})." ,
@@ -280,7 +301,7 @@ impl Count {
280
301
281
302
// Tuple of offset -> count where count is > 0
282
303
let non_zeros =
283
- prefix_lengths. iter ( ) . copied ( ) . enumerate ( ) . filter ( |( _idx, n) | * n > 0 ) . collect_vec ( ) ;
304
+ prefix_lengths. iter ( ) . copied ( ) . enumerate ( ) . filter ( |( _idx, n) | * n > 0.0 ) . collect_vec ( ) ;
284
305
285
306
info ! (
286
307
"{} read offsets: {}" ,
0 commit comments