8181import math
8282import re
8383import textwrap
84- from collections import defaultdict , deque
84+ from collections import defaultdict , deque , OrderedDict
8585from copy import deepcopy
8686
8787from pycaption .base import BaseReader , BaseWriter , CaptionNode , CaptionSet
@@ -198,6 +198,31 @@ def __init__(self, *args, **kw):
198198
199199 self .time = 0
200200
def _group_captions_by_start_time(self, caps):
    """Group captions that share a start time, ordered by that start time.

    :type caps: iterable
    :param caps: caption objects exposing ``start`` and ``end`` attributes;
        ``start`` values must be hashable and mutually orderable.

    :rtype: list
    :returns: a list of lists. Each inner list holds the captions that begin
        at the same timestamp (in their original relative order); the outer
        list is sorted by ascending start time.

    :raises ValueError: if captions with the same start time do not all
        share the same end time - this is not (yet?) supported.
    """
    # A plain dict is sufficient here: keys are unique and the groups are
    # emitted in sorted-key order below, so insertion order is irrelevant.
    by_start = {}
    for cap in caps:
        by_start.setdefault(cap.start, []).append(cap)

    grouped = []
    for start_time in sorted(by_start):
        same_start = by_start[start_time]
        # A lone caption trivially yields exactly one end time, so the
        # single-caption case needs no separate branch.
        if len({cap.end for cap in same_start}) != 1:
            raise ValueError(
                "Unsupported subtitles - overlapping subtitles with different end times found"
            )
        grouped.append(same_start)
    return grouped
225+
201226 def detect (self , content ):
202227 """Checks whether the given content is a proper SCC file
203228
@@ -211,7 +236,7 @@ def detect(self, content):
211236 else :
212237 return False
213238
214- def read (self , content , lang = "en-US" , simulate_roll_up = False , offset = 0 ):
239+ def read (self , content , lang = "en-US" , simulate_roll_up = False , offset = 0 , merge_captions = False ):
215240 """Converts the unicode string into a CaptionSet
216241
217242 :type content: str
@@ -228,6 +253,11 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
228253 :type offset: int
229254 :param offset:
230255
256+ :type merge_captions: bool
257+ :param merge_captions: If True, we will merge captions that have the same
258+ start and end time. We do this by merging their nodes together, separating
259+ them with a line break.
260+
231261 :rtype: CaptionSet
232262 """
233263 if not isinstance (content , str ):
@@ -244,7 +274,21 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
244274
245275 self ._flush_implicit_buffers (self .buffer_dict .active_key )
246276
247- captions = CaptionSet ({lang : self .caption_stash .get_all ()})
277+ captions_raw = self .caption_stash .get_all ()
278+ if merge_captions :
279+ _captions_by_start = self ._group_captions_by_start_time (captions_raw )
280+
281+ all_captions_with_same_time = [l for l in _captions_by_start if len (l ) > 1 ]
282+ for current_captions_with_same_time in all_captions_with_same_time :
283+ nodes_to_append = [CaptionNode (CaptionNode .BREAK )]
284+ for dupe_caption in current_captions_with_same_time [1 :]:
285+ nodes_to_append .extend (dupe_caption .nodes )
286+ nodes_to_append .append (CaptionNode (CaptionNode .BREAK ))
287+ captions_raw .remove (dupe_caption )
288+
289+ current_captions_with_same_time [0 ].nodes .extend (nodes_to_append )
290+
291+ captions = CaptionSet ({lang : captions_raw })
248292
249293 # check captions for incorrect lengths
250294 lines_too_long = defaultdict (list )
0 commit comments