Skip to content

Commit e57e52d

Browse files
committed
Attempt a smarter offset detection when "guides" are short and might be prone to multiple matches per read.
1 parent fa6b449 commit e57e52d

File tree

1 file changed

+38
-17
lines changed

1 file changed

+38
-17
lines changed

src/commands/count.rs

Lines changed: 38 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -243,35 +243,56 @@ impl Count {
243243
min_fraction: f64,
244244
) -> Result<PrefixInfo> {
245245
let guide_length = library.guide_length;
246-
let mut prefix_lengths = vec![0u64; 500];
246+
let mut prefix_lengths = vec![0f64; 500];
247247
let mut count = 0u64;
248248

249249
// Parse the first `sample_size` records to find exact match guides and
250250
// extract the sequence that precedes the guide
251251
parse_path(Some(fastq), |parser| {
252-
parser
253-
.each(|rec| {
254-
let read_bases = rec.seq();
255-
let read_length = read_bases.len();
252+
let mut read_offsets = Vec::<usize>::with_capacity(5);
256253

257-
if read_length >= guide_length {
258-
for trim in 0..=(read_length - guide_length) {
259-
let bases = &read_bases[trim..trim + guide_length];
254+
parser.each(|rec| {
255+
let read_bases = rec.seq();
256+
let read_length = read_bases.len();
260257

261-
if lookup.contains_key(bases) {
262-
prefix_lengths[trim] += 1;
263-
}
258+
if read_length >= guide_length {
259+
for trim in 0..=(read_length - guide_length) {
260+
let bases = &read_bases[trim..trim + guide_length];
261+
262+
if lookup.contains_key(bases) {
263+
read_offsets.push(trim);
264264
}
265265
}
266+
}
267+
268+
// If the read only matched at a single offset, just use it; otherwise prefer
269+
// match(es) that are perfect matches and allocation proportionally.
270+
if read_offsets.len() == 1 {
271+
prefix_lengths[read_offsets[0]] += 1.0;
272+
}
273+
else if read_offsets.len() > 1 {
274+
let perfect_match_offsets = read_offsets.iter().copied().filter(|&off| {
275+
let bases = &read_bases[off..off + guide_length];
276+
let guide = lookup.get(bases).unwrap();
277+
guide.bases.as_slice() == bases
278+
}).collect_vec();
279+
280+
let preferred_offsets = if perfect_match_offsets.is_empty() { &read_offsets } else { &perfect_match_offsets };
281+
let addend = 1f64 / preferred_offsets.len() as f64;
282+
for offset in preferred_offsets.iter().copied() {
283+
prefix_lengths[offset] += addend;
284+
}
285+
}
266286

267-
count += 1;
268-
count < sample_size
269-
})
270-
.expect("Failed to parse.");
287+
read_offsets.clear();
288+
count += 1;
289+
count < sample_size
290+
})
291+
.expect("Failed to parse.");
271292
})
272293
.context(format!("Failed to read {:?}", fastq))?;
273294

274-
let total_matched: u64 = prefix_lengths.iter().sum();
295+
let total_matched: f64 = prefix_lengths.iter().sum();
275296
let fraction_matched = total_matched as f64 / count as f64;
276297
info!(
277298
"In {:?} examined {} reads for guide start position and matched {} ({:.4}).",
@@ -280,7 +301,7 @@ impl Count {
280301

281302
// Tuple of offset -> count where count is > 0
282303
let non_zeros =
283-
prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0).collect_vec();
304+
prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0.0).collect_vec();
284305

285306
info!(
286307
"{} read offsets: {}",

0 commit comments

Comments
 (0)