81
81
import math
82
82
import re
83
83
import textwrap
84
- from collections import defaultdict , deque
84
+ from collections import defaultdict , deque , OrderedDict
85
85
from copy import deepcopy
86
86
87
87
from pycaption .base import BaseReader , BaseWriter , CaptionNode , CaptionSet
@@ -198,6 +198,31 @@ def __init__(self, *args, **kw):
198
198
199
199
self .time = 0
200
200
201
+ def _group_captions_by_start_time (self , caps ):
202
+ # group captions that have the same start time
203
+ caps_start_time = OrderedDict ()
204
+ for i , cap in enumerate (caps ):
205
+ if cap .start not in caps_start_time :
206
+ caps_start_time [cap .start ] = [cap ]
207
+ else :
208
+ caps_start_time [cap .start ].append (cap )
209
+ # order by start timestamp
210
+ caps_start_time = OrderedDict (sorted (caps_start_time .items (), key = lambda item : item [0 ]))
211
+
212
+ # check if captions with the same start time also have the same end time
213
+ # fail if different end times are found - this is not (yet?) supported
214
+ caps_final = []
215
+ for start_time , caps_list in caps_start_time .items ():
216
+ if len (caps_list ) == 1 :
217
+ caps_final .append (caps_list )
218
+ else :
219
+ end_times = list (set ([c .end for c in caps_list ]))
220
+ if len (end_times ) != 1 :
221
+ raise ValueError ("Unsupported subtitles - overlapping subtitles with different end times found" )
222
+ else :
223
+ caps_final .append (caps_list )
224
+ return caps_final
225
+
201
226
def detect (self , content ):
202
227
"""Checks whether the given content is a proper SCC file
203
228
@@ -211,7 +236,7 @@ def detect(self, content):
211
236
else :
212
237
return False
213
238
214
- def read (self , content , lang = "en-US" , simulate_roll_up = False , offset = 0 ):
239
+ def read (self , content , lang = "en-US" , simulate_roll_up = False , offset = 0 , merge_captions = False ):
215
240
"""Converts the unicode string into a CaptionSet
216
241
217
242
:type content: str
@@ -228,6 +253,11 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
228
253
:type offset: int
229
254
:param offset:
230
255
256
+ :type merge_captions: bool
257
+ :param merge_captions: If True, we will merge captions that have the same
258
+ start and end time. We do this by merging their nodes together, separating
259
+ them with a line break.
260
+
231
261
:rtype: CaptionSet
232
262
"""
233
263
if not isinstance (content , str ):
@@ -244,7 +274,21 @@ def read(self, content, lang="en-US", simulate_roll_up=False, offset=0):
244
274
245
275
self ._flush_implicit_buffers (self .buffer_dict .active_key )
246
276
247
- captions = CaptionSet ({lang : self .caption_stash .get_all ()})
277
+ captions_raw = self .caption_stash .get_all ()
278
+ if merge_captions :
279
+ _captions_by_start = self ._group_captions_by_start_time (captions_raw )
280
+
281
+ all_captions_with_same_time = [l for l in _captions_by_start if len (l ) > 1 ]
282
+ for current_captions_with_same_time in all_captions_with_same_time :
283
+ nodes_to_append = [CaptionNode (CaptionNode .BREAK )]
284
+ for dupe_caption in current_captions_with_same_time [1 :]:
285
+ nodes_to_append .extend (dupe_caption .nodes )
286
+ nodes_to_append .append (CaptionNode (CaptionNode .BREAK ))
287
+ captions_raw .remove (dupe_caption )
288
+
289
+ current_captions_with_same_time [0 ].nodes .extend (nodes_to_append )
290
+
291
+ captions = CaptionSet ({lang : captions_raw })
248
292
249
293
# check captions for incorrect lengths
250
294
lines_too_long = defaultdict (list )
0 commit comments