Skip to content

Commit 19bddbf

Browse files
committed
Contig stitcher: base drawing only on the parent-child relationship
1 parent 0b8bac5 commit 19bddbf

File tree

1 file changed

+101
-76
lines changed

1 file changed

+101
-76
lines changed

micall/core/plot_contigs.py

Lines changed: 101 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -405,9 +405,10 @@ def plot_stitcher_coverage(logs: Iterable[events.EventType], genome_coverage_svg
405405

406406

407407
def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure:
408-
contig_map: Dict[str, GenotypedContig] = {}
408+
complete_contig_map: Dict[str, GenotypedContig] = {}
409409
name_mappings: Dict[str, str] = {}
410-
parent_graph: Dict[str, List[str]] = {}
410+
complete_parent_graph: Dict[str, List[str]] = {}
411+
alive_set: Set[str] = set()
411412
morphism_graph: Dict[str, List[str]] = {}
412413
reduced_parent_graph: Dict[str, List[str]] = {}
413414
transitive_parent_graph: Dict[str, List[str]] = {}
@@ -420,12 +421,9 @@ def build_stitcher_figure(logs: Iterable[events.EventType]) -> Figure:
420421
overlap_lefttake_map: Dict[str, str] = {}
421422
overlap_righttake_map: Dict[str, str] = {}
422423
overlap_sibling_map: Dict[str, str] = {}
423-
combine_list: List[str] = []
424424
combine_left_edge: Dict[str, str] = {}
425425
combine_right_edge: Dict[str, str] = {}
426-
temporary: Set[str] = set()
427426
children_join_points: List[str] = []
428-
children_meet_points: List[str] = []
429427
query_position_map: Dict[str, int] = {}
430428
initial_alignments: Dict[str, List[CigarHit]] = {}
431429

@@ -439,6 +437,18 @@ def remove_intermediate_edges(graph):
439437
ret[parent] = lst
440438
return ret
441439

440+
def remove_transitive_edges(graph):
441+
tr_cl = transitive_closure(graph)
442+
ret = {}
443+
for parent, children in graph.items():
444+
lst = []
445+
for child in children:
446+
is_transitive = any(child in tr_cl.get(other_node, []) for other_node in children if other_node != child)
447+
if not is_transitive:
448+
lst.append(child)
449+
ret[parent] = lst
450+
return ret
451+
442452
def get_all_ancestors(recur, lst, graph, ancestor_name):
443453
if ancestor_name not in recur:
444454
recur = recur.copy()
@@ -460,9 +470,15 @@ def transitive_closure(graph):
460470
ret[parent] = lst
461471
return ret
462472

463-
def reflexive_closure(graph):
464-
ret = graph.copy()
473+
def copy_graph(graph):
474+
ret = {}
465475
for parent, children in graph.items():
476+
ret[parent] = children[:]
477+
return ret
478+
479+
def reflexive_closure(graph):
480+
ret = copy_graph(graph)
481+
for parent, children in ret.items():
466482
if parent not in children:
467483
children.append(parent)
468484
for child in children[:]:
@@ -483,7 +499,7 @@ def inverse_graph(graph):
483499
return ret
484500

485501
def graph_sum(graph_a, graph_b):
486-
ret = graph_a.copy()
502+
ret = copy_graph(graph_a)
487503
for key, values in graph_b.items():
488504
if key not in ret:
489505
ret[key] = []
@@ -497,52 +513,56 @@ def symmetric_closure(graph):
497513
return graph_sum(graph, inverse_graph(graph))
498514

499515
def record_contig(contig: GenotypedContig, parents: List[GenotypedContig]):
500-
contig_map[contig.name] = contig
516+
complete_contig_map[contig.name] = contig
501517
if [contig.name] != [parent.name for parent in parents]:
502518
for parent in parents:
503-
contig_map[parent.name] = parent
504-
if contig.name not in parent_graph:
505-
parent_graph[contig.name] = []
519+
complete_contig_map[parent.name] = parent
520+
if contig.name not in complete_parent_graph:
521+
complete_parent_graph[contig.name] = []
506522

507-
parent_graph[contig.name].append(parent.name)
523+
complete_parent_graph[contig.name].append(parent.name)
508524

509-
def record_morphism(contig: Contig, original: Contig):
510-
if original.name not in morphism_graph:
511-
morphism_graph[original.name] = []
512-
lst = morphism_graph[original.name]
513-
if contig.name not in lst:
514-
lst.append(contig.name)
525+
def record_alive(contig: Contig):
526+
alive_set.add(contig.name)
515527

516528
def record_bad_contig(contig: GenotypedContig, lst: List[str]):
517-
contig_map[contig.name] = contig
529+
complete_contig_map[contig.name] = contig
518530
lst.append(contig.name)
519531

520-
521532
for event in logs:
522533
if isinstance(event, events.FinalCombine):
523534
record_contig(event.result, event.contigs)
535+
record_alive(event.result)
524536
elif isinstance(event, events.SplitGap):
525537
record_contig(event.left, [event.contig])
526538
record_contig(event.right, [event.contig])
539+
record_alive(event.left)
540+
record_alive(event.right)
527541
elif isinstance(event, events.Intro):
528542
record_contig(event.contig, [])
543+
record_alive(event.contig)
529544
elif isinstance(event, events.Hit):
530545
record_contig(event.part, [event.contig])
546+
record_alive(event.part)
531547
elif isinstance(event, events.NoRef):
532548
record_bad_contig(event.contig, unknown)
549+
record_alive(event.contig)
533550
elif isinstance(event, events.ZeroHits):
534551
record_bad_contig(event.contig, anomaly)
552+
record_alive(event.contig)
535553
elif isinstance(event, events.StrandConflict):
536554
record_bad_contig(event.contig, anomaly)
555+
record_alive(event.contig)
537556
elif isinstance(event, events.ReverseComplement):
538557
record_contig(event.result, [event.contig])
558+
record_alive(event.result)
539559
elif isinstance(event, events.HitNumber):
540560
initial_alignments[event.contig.name] = event.connected
561+
record_alive(event.contig)
541562
elif isinstance(event, events.Munge):
542563
record_contig(event.result, [event.left, event.right])
543564
elif isinstance(event, (events.LStrip, events.RStrip)):
544565
record_contig(event.result, [event.original])
545-
record_morphism(event.result, event.original)
546566
elif isinstance(event, events.Overlap):
547567
overlaps_list.append(event.left_overlap.name)
548568
overlaps_list.append(event.right_overlap.name)
@@ -554,20 +574,20 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]):
554574
overlap_sibling_map[event.right_remainder.name] = event.left_remainder.name
555575
elif isinstance(event, events.Drop):
556576
record_bad_contig(event.contig, discarded)
577+
record_alive(event.contig)
557578
elif isinstance(event, events.StitchCut):
558579
record_contig(event.left_overlap, [event.left])
559580
record_contig(event.left_remainder, [event.left])
560581
record_contig(event.right_overlap, [event.right])
561582
record_contig(event.right_remainder, [event.right])
562583
elif isinstance(event, events.Stitch):
563584
record_contig(event.result, [event.left, event.right])
585+
record_alive(event.result)
564586
elif isinstance(event, events.Cut):
565587
record_contig(event.left, [event.original])
566588
record_contig(event.right, [event.original])
567589
elif isinstance(event, events.Combine):
568-
for contig in event.contigs:
569-
combine_list.append(contig.name)
570-
590+
record_alive(event.result)
571591
record_contig(event.result, event.contigs)
572592
if event.contigs:
573593
combine_left_edge[event.result.name] = event.contigs[0].name
@@ -578,6 +598,26 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]):
578598
x: NoReturn = event
579599
raise RuntimeError(f"Unrecognized action or event: {event}")
580600

601+
nodup_parent_graph = remove_transitive_edges(complete_parent_graph)
602+
603+
# Close alive set by parents
604+
def extend_alive(contig_name):
605+
if contig_name not in alive_set:
606+
alive_set.add(contig_name)
607+
608+
for parent_name in nodup_parent_graph.get(contig_name, []):
609+
extend_alive(parent_name)
610+
611+
for contig_name in alive_set.copy():
612+
extend_alive(contig_name)
613+
614+
parent_graph: Dict[str, List[str]] = {}
615+
for contig_name in nodup_parent_graph:
616+
if contig_name in alive_set:
617+
parent_graph[contig_name] = nodup_parent_graph[contig_name]
618+
619+
contig_map: Dict[str, GenotypedContig] = {k: v for k, v in complete_contig_map.items() if k in alive_set}
620+
bad_contigs = anomaly + discarded + unknown
581621
group_refs = {contig.group_ref: len(contig.ref_seq) for contig in contig_map.values() if contig.ref_seq}
582622
children_graph = inverse_graph(parent_graph)
583623
transitive_parent_graph = transitive_closure(parent_graph)
@@ -590,23 +630,18 @@ def record_bad_contig(contig: GenotypedContig, lst: List[str]):
590630
sorted_sinks = list(sorted(child_name for
591631
child_name in contig_map
592632
if child_name not in children_graph))
593-
bad_contigs = anomaly + discarded + unknown
633+
634+
for contig_name, parents in parent_graph.items():
635+
if len(parents) == 1:
636+
morphism_graph[parents[0]] = [contig_name]
594637

595638
transitive_morphism_graph = transitive_closure(morphism_graph)
596639
reduced_morphism_graph = remove_intermediate_edges(transitive_morphism_graph)
597640
eqv_morphism_graph = reflexive_closure(symmetric_closure(transitive_morphism_graph))
598641

599-
for contig_name in overlaps_list:
600-
temporary.add(contig_name)
601-
for child in transitive_children_graph.get(contig_name, []):
602-
temporary.add(child)
603-
604642
for contig_name, parents in parent_graph.items():
605-
if len(parents) > 2:
643+
if len(parents) > 1:
606644
children_join_points.append(contig_name)
607-
for contig_name, children in children_graph.items():
608-
if len(children) > 2:
609-
children_meet_points.append(contig_name)
610645

611646
def hits_to_insertions(hits: List[CigarHit]):
612647
for hit in hits:
@@ -621,12 +656,6 @@ def hits_to_insertions(hits: List[CigarHit]):
621656
nonempty_insertions = [gap for gap in all_insertions if gap.query_length > 0]
622657
unaligned_map[contig_name] = nonempty_insertions
623658

624-
last_join_points_parent = {contig_name for join in children_join_points for contig_name in transitive_parent_graph.get(join, [])}
625-
last_join_points = []
626-
for contig_name in children_join_points:
627-
if contig_name not in last_join_points_parent:
628-
last_join_points.append(contig_name)
629-
630659
def set_query_position(contig: Contig):
631660
if contig.name in query_position_map:
632661
return
@@ -651,12 +680,6 @@ def set_query_position(contig: Contig):
651680
for contig in contig_map.values():
652681
set_query_position(contig)
653682

654-
# Closing `temporary'
655-
for contig_name in contig_map:
656-
if contig_name in temporary:
657-
for clone in eqv_morphism_graph.get(contig_name, [contig_name]):
658-
temporary.add(clone)
659-
660683
def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map):
661684
for parent in edge_table:
662685
child_remainder = edge_table[parent]
@@ -675,39 +698,42 @@ def copy_takes_one_side(edge_table, overlap_xtake_map, overlap_xparent_map):
675698
while list(copy_takes_one_side(combine_left_edge, overlap_righttake_map, overlap_rightparent_map)): pass
676699

677700
final_parts: Dict[str, bool] = {}
678-
for contig_name in contig_map:
679-
if contig_name in temporary:
680-
continue
701+
pre_join_points = []
681702

682-
if contig_name in combine_list:
683-
finals = reduced_morphism_graph.get(contig_name, [contig_name])
684-
if len(finals) == 1:
685-
final_parts[finals[0]] = True
703+
def add_join_parents(join_name):
704+
if join_name in children_join_points:
705+
for contig_name in parent_graph.get(join_name, [join_name]):
706+
add_join_parents(contig_name)
707+
else:
708+
pre_join_points.append(join_name)
686709

687-
elif contig_name in bad_contigs:
688-
final_parts[contig_name] = True
710+
for join_name in children_join_points + sorted_sinks:
711+
add_join_parents(join_name)
689712

690-
for join in last_join_points + sorted_sinks:
691-
parents = parent_graph.get(join, [join])
692-
if not any(isinstance(contig_map[parent], AlignedContig) for parent in parents):
693-
parents = [join]
713+
def is_ancestor(contig_name, other_names):
714+
for other in other_names:
715+
if other == contig_name:
716+
continue
694717

695-
for contig_name in parents:
696-
for contig_name in reduced_morphism_graph.get(contig_name, [contig_name]):
697-
if contig_name in bad_contigs:
698-
continue
718+
if contig_name in transitive_children_graph.get(other, []):
719+
return True
720+
return False
699721

700-
if any(contig_name in transitive_parent_graph.get(bad, []) for bad in bad_contigs):
701-
continue
722+
for contig_name in pre_join_points[:]:
723+
if is_ancestor(contig_name, pre_join_points):
724+
pre_join_points.remove(contig_name)
702725

703-
if any(eqv in temporary for eqv in eqv_morphism_graph.get(contig_name, [contig_name])):
704-
continue
726+
for contig_name in pre_join_points:
727+
if any(contig_name in transitive_parent_graph.get(bad, []) for bad in bad_contigs):
728+
continue
705729

706-
transitive_parent = eqv_parent_graph.get(contig_name, [contig_name])
707-
if any(parent in transitive_parent for parent in final_parts):
708-
continue
730+
if any(contig_name in eqv_morphism_graph.get(temp_name, [temp_name]) for temp_name in overlaps_list):
731+
continue
709732

710-
final_parts[contig_name] = True
733+
final_parts[contig_name] = True
734+
735+
for contig_name in bad_contigs:
736+
final_parts[contig_name] = True
711737

712738
final_children_mapping: Dict[str, List[str]] = {}
713739
for parent_name in sorted_roots:
@@ -838,11 +864,10 @@ def get_contig_coordinates(contig: GenotypedContig) -> Tuple[int, int, int, int]
838864
return (a_r_st, a_r_ei, f_r_st, f_r_ei)
839865

840866
def get_tracks(repeatset: Set[str], group_ref: str, contig_name: str) -> Iterable[Track]:
841-
parts = final_children_mapping[contig_name]
867+
parts_names = final_children_mapping[contig_name]
868+
parts = [contig_map[name] for name in parts_names]
842869
parts = list(sorted(parts, key=lambda part: part.alignment.r_st if isinstance(part, AlignedContig) else -1))
843-
for prev_name, part_name, next_naem in sliding_window(parts):
844-
part = contig_map[part_name]
845-
870+
for part in parts:
846871
if part.name in repeatset:
847872
continue
848873

0 commit comments

Comments
 (0)