Skip to content

Commit a86530f

Browse files
committed
Cigar tools: fix edge cases of strip
1 parent 947b44a commit a86530f

File tree

3 files changed

+41
-27
lines changed

3 files changed

+41
-27
lines changed

Diff for: micall/core/contig_stitcher.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def lstrip_query(self) -> 'AlignedContig':
103103
alignment. The CIGAR alignment is also updated to reflect the trimming.
104104
"""
105105

106-
alignment = self.alignment.lstrip_query()
106+
alignment = self.alignment.lstrip_query().lstrip_reference()
107107
q_remainder, query = self.cut_query(alignment.q_st - 0.5)
108108
alignment = alignment.translate(0, -1 * alignment.q_st)
109109
result = AlignedContig.make(query, alignment, self.strand)
@@ -120,7 +120,7 @@ def rstrip_query(self) -> 'AlignedContig':
120120
alignment. The CIGAR alignment is also updated to reflect the trimming.
121121
"""
122122

123-
alignment = self.alignment.rstrip_query()
123+
alignment = self.alignment.rstrip_query().rstrip_reference()
124124
query, q_remainder = self.cut_query(alignment.q_ei + 0.5)
125125
result = AlignedContig.make(query, alignment, self.strand)
126126
logger.debug("Doing rstrip of %r resulted in %r, so %s (len %s) became %s (len %s)",

Diff for: micall/tests/test_cigar_tools.py

+33-17
Original file line numberDiff line numberDiff line change
@@ -343,9 +343,11 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point):
343343
('4D6I5M@1->1', '4D5M@7->1'),
344344
('4I6D5M@1->1', '6D5M@5->1'),
345345
('6I4D@1->1', '4D@7->1'),
346-
('6D4I@1->1', '6D4I@1->1'),
347-
('4D6I@1->1', '4D6I@1->1'),
346+
('6D4I@1->1', '6D@5->1'),
347+
('4D6I@1->1', '4D@7->1'),
348348
('4I6D@1->1', '6D@5->1'),
349+
('4I@1->1', '@5->1'),
350+
('4D@1->1', '4D@1->1'),
349351
('@1->1', '@1->1'),
350352
]
351353

@@ -364,9 +366,11 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point):
364366
('5M6D4I@1->1', '5M6D@1->1'),
365367
('5M6I4D@1->1', '5M4D@1->1'),
366368
('6D4I@1->1', '6D@1->1'),
367-
('6I4D@1->1', '6I4D@1->1'),
368-
('4I6D@1->1', '4I6D@1->1'),
369+
('6I4D@1->1', '4D@1->1'),
370+
('4I6D@1->1', '6D@1->1'),
369371
('4D6I@1->1', '4D@1->1'),
372+
('4I@1->1', '@1->1'),
373+
('4D@1->1', '4D@1->1'),
370374
('@1->1', '@1->1'),
371375
]
372376

@@ -384,10 +388,12 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point):
384388
('3I2D3I2D5M@1->1', '6I5M@1->5'),
385389
('4D6I5M@1->1', '6I5M@1->5'),
386390
('4I6D5M@1->1', '4I5M@1->7'),
387-
('6I4D@1->1', '6I4D@1->1'),
391+
('6I4D@1->1', '6I@1->5'),
388392
('6D4I@1->1', '4I@1->7'),
389393
('4D6I@1->1', '6I@1->5'),
390-
('4I6D@1->1', '4I6D@1->1'),
394+
('4I6D@1->1', '4I@1->7'),
395+
('4I@1->1', '4I@1->1'),
396+
('4D@1->1', '@1->5'),
391397
('@1->1', '@1->1'),
392398
]
393399

@@ -405,10 +411,12 @@ def test_cigar_hit_ref_cut_add_prop_exhaustive(hit, cut_point):
405411
('5M2D3I2D3I@1->1', '5M6I@1->1'),
406412
('5M6D4I@1->1', '5M4I@1->1'),
407413
('5M6I4D@1->1', '5M6I@1->1'),
408-
('6D4I@1->1', '6D4I@1->1'),
414+
('6D4I@1->1', '4I@1->1'),
409415
('6I4D@1->1', '6I@1->1'),
410416
('4I6D@1->1', '4I@1->1'),
411-
('4D6I@1->1', '4D6I@1->1'),
417+
('4D6I@1->1', '6I@1->1'),
418+
('4I@1->1', '4I@1->1'),
419+
('4D@1->1', '@1->1'),
412420
('@1->1', '@1->1'),
413421
]
414422

@@ -493,8 +501,12 @@ def test_cigar_hit_reference_strip_is_idempotent(hit):
493501
def test_cigar_hit_reference_strips_are_commutative(hit):
494502
hit = parsed_hit(hit)
495503

496-
assert hit.rstrip_reference().lstrip_reference() \
497-
== hit.lstrip_reference().rstrip_reference()
504+
if len(hit.cigar.coordinate_mapping.ref_to_query) > 0:
505+
assert hit.rstrip_reference().lstrip_reference() \
506+
== hit.lstrip_reference().rstrip_reference()
507+
else:
508+
assert hit.rstrip_reference().lstrip_reference().cigar \
509+
== hit.lstrip_reference().rstrip_reference().cigar
498510

499511

500512
@pytest.mark.parametrize('hit, expected', lstrip_query_cases)
@@ -540,13 +552,13 @@ def test_cigar_hit_query_strip_combines_with_add(hit):
540552

541553

542554
@pytest.mark.parametrize('hit', strip_prop_cases_all)
543-
def test_cigar_hit_query_strip_never_crashes(hit):
555+
def test_cigar_hit_strips_work_together(hit):
544556
hit = parsed_hit(hit)
545557

546-
hit.rstrip_query().lstrip_query()
547-
hit.lstrip_query().rstrip_query()
548-
hit.lstrip_query().lstrip_query()
549-
hit.rstrip_query().rstrip_query()
558+
rstrip = str(hit.rstrip_query().rstrip_reference().cigar)
559+
assert not rstrip.endswith("I") and not rstrip.endswith("D")
560+
lstrip = str(hit.lstrip_query().lstrip_reference().cigar)
561+
assert not lstrip.startswith("I") and not lstrip.startswith("D")
550562

551563

552564
@pytest.mark.parametrize('hit', strip_prop_cases_all)
@@ -570,8 +582,12 @@ def test_cigar_hit_query_strip_is_idempotent(hit):
570582
def test_cigar_hit_query_strips_are_commutative(hit):
571583
hit = parsed_hit(hit)
572584

573-
assert hit.rstrip_query().lstrip_query() \
574-
== hit.lstrip_query().rstrip_query()
585+
if len(hit.cigar.coordinate_mapping.ref_to_query) > 0:
586+
assert hit.rstrip_query().lstrip_query() \
587+
== hit.lstrip_query().rstrip_query()
588+
else:
589+
assert hit.rstrip_query().lstrip_query().cigar \
590+
== hit.lstrip_query().rstrip_query().cigar
575591

576592

577593
@pytest.mark.parametrize('hit, cut_point', [(x[0], x[1]) for x in cigar_hit_ref_cut_cases

Diff for: micall/utils/cigar_tools.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -229,7 +229,7 @@ def slice_operations(self, start_inclusive, end_noninclusive) -> 'Cigar':
229229
def lstrip_reference(self) -> 'Cigar':
230230
""" Return a copy of the Cigar with leading (unmatched) reference elements removed. """
231231

232-
min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=0)
232+
min_r = min(self.coordinate_mapping.ref_to_query.keys(), default=None)
233233
min_op = self.coordinate_mapping.ref_to_op.get(min_r, float("inf"))
234234

235235
ops = [(1, op) for i, (op, ref_pointer, query_pointer)
@@ -241,9 +241,8 @@ def lstrip_reference(self) -> 'Cigar':
241241
def rstrip_reference(self) -> 'Cigar':
242242
""" Return a copy of the Cigar with trailing (unmatched) reference elements removed. """
243243

244-
max_r = max(self.coordinate_mapping.ref_to_query.keys(),
245-
default=len(self.coordinate_mapping.ref_to_op) - 1)
246-
max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("inf"))
244+
max_r = max(self.coordinate_mapping.ref_to_query.keys(), default=None)
245+
max_op = self.coordinate_mapping.ref_to_op.get(max_r, float("-inf"))
247246

248247
ops = [(1, op) for i, (op, ref_pointer, query_pointer)
249248
in enumerate(self.iterate_operations_with_pointers())
@@ -254,7 +253,7 @@ def rstrip_reference(self) -> 'Cigar':
254253
def lstrip_query(self) -> 'Cigar':
255254
""" Return a copy of the Cigar with leading (unmatched) query elements removed. """
256255

257-
min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=0)
256+
min_q = min(self.coordinate_mapping.query_to_ref.keys(), default=None)
258257
min_op = self.coordinate_mapping.query_to_op.get(min_q, float("inf"))
259258

260259
ops = [(1, op) for i, (op, ref_pointer, query_pointer)
@@ -266,9 +265,8 @@ def lstrip_query(self) -> 'Cigar':
266265
def rstrip_query(self) -> 'Cigar':
267266
""" Return a copy of the Cigar with trailing (unmatched) query elements removed. """
268267

269-
max_q = max(self.coordinate_mapping.query_to_ref.keys(),
270-
default=len(self.coordinate_mapping.query_to_op) - 1)
271-
max_op = self.coordinate_mapping.query_to_op.get(max_q, float("inf"))
268+
max_q = max(self.coordinate_mapping.query_to_ref.keys(), default=None)
269+
max_op = self.coordinate_mapping.query_to_op.get(max_q, float("-inf"))
272270

273271
ops = [(1, op) for i, (op, ref_pointer, query_pointer)
274272
in enumerate(self.iterate_operations_with_pointers())

0 commit comments

Comments
 (0)