@@ -53,13 +53,13 @@ def abstract_inline_conversion(markup_fn):
53
53
the text if it looks like an HTML tag. markup_fn is necessary to allow for
54
54
references to self.strong_em_symbol etc.
55
55
"""
56
- def implementation (self , el , text , convert_as_inline ):
56
+ def implementation (self , el , text , parent_tags ):
57
57
markup_prefix = markup_fn (self )
58
58
if markup_prefix .startswith ('<' ) and markup_prefix .endswith ('>' ):
59
59
markup_suffix = '</' + markup_prefix [1 :]
60
60
else :
61
61
markup_suffix = markup_prefix
62
- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
62
+ if '_noformat' in parent_tags :
63
63
return text
64
64
prefix , suffix , text = chomp (text )
65
65
if not text :
@@ -166,18 +166,13 @@ def convert(self, html):
166
166
return self .convert_soup (soup )
167
167
168
168
def convert_soup(self, soup):
    """Convert an already-parsed soup object to Markdown text.

    The soup is handed to process_tag() as the root node, starting with an
    empty set of parent tags because the root has no ancestors.
    """
    return self.process_tag(soup, parent_tags=set())
170
170
171
- def process_tag (self , node , convert_as_inline ):
172
- text = ''
171
+ def process_tag (self , node , parent_tags = None ):
172
+ if parent_tags is None :
173
+ parent_tags = set ()
173
174
174
- # For Markdown headings and table cells, convert children as inline
175
- # (so that block element children do not produce newlines).
176
- convert_children_as_inline = (
177
- convert_as_inline # propagated from parent
178
- or html_heading_re .match (node .name ) is not None # headings
179
- or node .name in ['td' , 'th' ] # table cells
180
- )
175
+ text = ''
181
176
182
177
# Collect child elements to process, ignoring whitespace-only text elements
183
178
# adjacent to the inner/outer boundaries of block elements.
@@ -208,28 +203,48 @@ def _can_ignore(el):
208
203
209
204
children_to_convert = [child for child in node .children if not _can_ignore (child )]
210
205
206
+ node_name = node .name
207
+
211
208
# Convert the children first
212
- for el in children_to_convert :
213
- if isinstance (el , NavigableString ):
214
- text += self .process_text (el )
215
- else :
216
- text_strip = text .rstrip ('\n ' )
217
- newlines_left = len (text ) - len (text_strip )
218
- next_text = self .process_tag (el , convert_children_as_inline )
219
- next_text_strip = next_text .lstrip ('\n ' )
220
- newlines_right = len (next_text ) - len (next_text_strip )
221
- newlines = '\n ' * max (newlines_left , newlines_right )
222
- text = text_strip + newlines + next_text_strip
209
+ if children_to_convert :
210
+ # for children tags, start with a copy of the parent tag set
211
+ parent_tags_for_children = set (parent_tags )
212
+
213
+ # add this tag's name as a parent
214
+ parent_tags_for_children .add (node_name )
215
+
216
+ # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
217
+ if (
218
+ html_heading_re .match (node_name ) is not None # headings
219
+ or node_name in {'td' , 'th' } # table cells
220
+ ):
221
+ parent_tags_for_children .add ('_inline' )
222
+
223
+ # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
224
+ if node_name in {'pre' , 'code' , 'kbd' , 'samp' }:
225
+ parent_tags_for_children .add ('_noformat' )
226
+
227
+ for el in children_to_convert :
228
+ if isinstance (el , NavigableString ):
229
+ text += self .process_text (el , parent_tags = parent_tags_for_children )
230
+ else :
231
+ text_strip = text .rstrip ('\n ' )
232
+ newlines_left = len (text ) - len (text_strip )
233
+ next_text = self .process_tag (el , parent_tags = parent_tags_for_children )
234
+ next_text_strip = next_text .lstrip ('\n ' )
235
+ newlines_right = len (next_text ) - len (next_text_strip )
236
+ newlines = '\n ' * max (newlines_left , newlines_right )
237
+ text = text_strip + newlines + next_text_strip
223
238
224
239
# apply this tag's final conversion function
225
- convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node . name )
240
+ convert_fn_name = "convert_%s" % re .sub (r"[\[\]:-]" , "_" , node_name )
226
241
convert_fn = getattr (self , convert_fn_name , None )
227
- if convert_fn and self .should_convert_tag (node . name ):
228
- text = convert_fn (node , text , convert_as_inline )
242
+ if convert_fn and self .should_convert_tag (node_name ):
243
+ text = convert_fn (node , text , parent_tags = parent_tags )
229
244
230
245
return text
231
246
232
- def convert__document_ (self , el , text , convert_as_inline ):
247
+ def convert__document_ (self , el , text , parent_tags ):
233
248
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
234
249
if self .options ['strip_document' ] == LSTRIP :
235
250
text = text .lstrip ('\n ' ) # remove leading separation newlines
@@ -244,19 +259,19 @@ def convert__document_(self, el, text, convert_as_inline):
244
259
245
260
return text
246
261
247
- def process_text (self , el ):
262
+ def process_text (self , el , parent_tags ):
248
263
text = six .text_type (el ) or ''
249
264
250
265
# normalize whitespace if we're not inside a preformatted element
251
- if not el . find_parent ( 'pre' ) :
266
+ if 'pre' not in parent_tags :
252
267
if self .options ['wrap' ]:
253
268
text = all_whitespace_re .sub (' ' , text )
254
269
else :
255
270
text = newline_whitespace_re .sub ('\n ' , text )
256
271
text = whitespace_re .sub (' ' , text )
257
272
258
273
# escape special characters if we're not inside a preformatted or code element
259
- if not el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
274
+ if '_noformat' not in parent_tags :
260
275
text = self .escape (text )
261
276
262
277
# remove leading whitespace at the start or just after a
@@ -279,8 +294,8 @@ def __getattr__(self, attr):
279
294
if m :
280
295
n = int (m .group (1 ))
281
296
282
- def convert_tag (el , text , convert_as_inline ):
283
- return self ._convert_hn (n , el , text , convert_as_inline )
297
+ def convert_tag (el , text , parent_tags ):
298
+ return self ._convert_hn (n , el , text , parent_tags )
284
299
285
300
convert_tag .__name__ = 'convert_h%s' % n
286
301
setattr (self , convert_tag .__name__ , convert_tag )
@@ -327,8 +342,8 @@ def underline(self, text, pad_char):
327
342
text = (text or '' ).rstrip ()
328
343
return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
329
344
330
- def convert_a (self , el , text , convert_as_inline ):
331
- if el . find_parent ([ 'pre' , 'code' , 'kbd' , 'samp' ]) :
345
+ def convert_a (self , el , text , parent_tags ):
346
+ if '_noformat' in parent_tags :
332
347
return text
333
348
prefix , suffix , text = chomp (text )
334
349
if not text :
@@ -349,10 +364,10 @@ def convert_a(self, el, text, convert_as_inline):
349
364
350
365
convert_b = abstract_inline_conversion (lambda self : 2 * self .options ['strong_em_symbol' ])
351
366
352
- def convert_blockquote (self , el , text , convert_as_inline ):
367
+ def convert_blockquote (self , el , text , parent_tags ):
353
368
# handle some early-exit scenarios
354
369
text = (text or '' ).strip ()
355
- if convert_as_inline :
370
+ if '_inline' in parent_tags :
356
371
return ' ' + text + ' '
357
372
if not text :
358
373
return "\n "
@@ -365,25 +380,25 @@ def _indent_for_blockquote(match):
365
380
366
381
return '\n ' + text + '\n \n '
367
382
368
def convert_br(self, el, text, parent_tags):
    """Convert a <br> element to a Markdown hard line break.

    Returns an empty string when '_inline' is in parent_tags, since a line
    break cannot be represented in an inline-only context. Otherwise the
    break style is selected by options['newline_style'].
    """
    if '_inline' in parent_tags:
        return ""

    if self.options['newline_style'].lower() == BACKSLASH:
        # backslash immediately followed by the newline
        return '\\\n'
    else:
        # Markdown hard line break: two trailing spaces before the newline
        return '  \n'
376
391
377
def convert_code(self, el, text, parent_tags):
    """Convert a <code> element.

    When 'pre' is among the parent tags the text is returned unchanged.
    Otherwise the conversion is delegated to abstract_inline_conversion
    with a backtick as the markup marker.
    """
    if 'pre' in parent_tags:
        return text
    converter = abstract_inline_conversion(lambda self: '`')
    return converter(self, el, text, parent_tags)
382
397
383
398
convert_del = abstract_inline_conversion (lambda self : '~~' )
384
399
385
def convert_div(self, el, text, parent_tags):
    """Convert a <div> element.

    In an inline context ('_inline' in parent_tags) the stripped text is
    padded with a single space on each side. Otherwise the stripped text is
    surrounded by blank lines, or an empty string is returned when nothing
    remains after stripping.
    """
    if '_inline' in parent_tags:
        return ' ' + text.strip() + ' '
    text = text.strip()
    return '\n\n%s\n\n' % text if text else ''
@@ -396,9 +411,9 @@ def convert_div(self, el, text, convert_as_inline):
396
411
397
412
convert_kbd = convert_code
398
413
399
- def convert_dd (self , el , text , convert_as_inline ):
414
+ def convert_dd (self , el , text , parent_tags ):
400
415
text = (text or '' ).strip ()
401
- if convert_as_inline :
416
+ if '_inline' in parent_tags :
402
417
return ' ' + text + ' '
403
418
if not text :
404
419
return '\n '
@@ -414,11 +429,11 @@ def _indent_for_dd(match):
414
429
415
430
return '%s\n ' % text
416
431
417
- def convert_dt (self , el , text , convert_as_inline ):
432
+ def convert_dt (self , el , text , parent_tags ):
418
433
# remove newlines from term text
419
434
text = (text or '' ).strip ()
420
435
text = all_whitespace_re .sub (' ' , text )
421
- if convert_as_inline :
436
+ if '_inline' in parent_tags :
422
437
return ' ' + text + ' '
423
438
if not text :
424
439
return '\n '
@@ -428,9 +443,9 @@ def convert_dt(self, el, text, convert_as_inline):
428
443
429
444
return '\n %s\n ' % text
430
445
431
- def _convert_hn (self , n , el , text , convert_as_inline ):
446
+ def _convert_hn (self , n , el , text , parent_tags ):
432
447
""" Method name prefixed with _ to prevent <hn> to call this """
433
- if convert_as_inline :
448
+ if '_inline' in parent_tags :
434
449
return text
435
450
436
451
# prevent MemoryErrors in case of very large n
@@ -447,46 +462,41 @@ def _convert_hn(self, n, el, text, convert_as_inline):
447
462
return '\n \n %s %s %s\n \n ' % (hashes , text , hashes )
448
463
return '\n \n %s %s\n \n ' % (hashes , text )
449
464
450
def convert_hr(self, el, text, parent_tags):
    """Convert an <hr> element to a '---' rule surrounded by blank lines.

    The element, its text and the parent tags are not consulted.
    """
    return '\n\n---\n\n'
452
467
453
468
convert_i = convert_em
454
469
455
def convert_img(self, el, text, parent_tags):
    """Convert an <img> element to Markdown image syntax.

    The alt, src and title attributes are read from el.attrs; missing or
    empty values become empty strings. In an inline context ('_inline' in
    parent_tags) only the alt text is returned, unless the name of the
    image's parent tag is listed in options['keep_inline_images_in'].
    """
    alt = el.attrs.get('alt', None) or ''
    src = el.attrs.get('src', None) or ''
    title = el.attrs.get('title', None) or ''
    # the title is emitted as a quoted string, so embedded quotes are escaped
    title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
    if ('_inline' in parent_tags
            and el.parent.name not in self.options['keep_inline_images_in']):
        return alt

    return '![%s](%s%s)' % (alt, src, title_part)
465
480
466
def convert_list(self, el, text, parent_tags):
    """Convert a list element and its already-converted items.

    A list with 'li' among its parent tags is nested: it is emitted on a new
    line with trailing whitespace removed. A top-level list is preceded by a
    blank line and, when the next block-content sibling is not another
    'ul'/'ol', followed by an extra newline.
    """
    # Converting a list to inline is undefined, so an '_inline' entry in
    # parent_tags is deliberately ignored here.

    before_paragraph = False
    next_sibling = _next_block_content_sibling(el)
    if next_sibling and next_sibling.name not in ['ul', 'ol']:
        before_paragraph = True
    if 'li' in parent_tags:
        # remove trailing newline if we're in a nested list
        return '\n' + text.rstrip()
    return '\n\n' + text + ('\n' if before_paragraph else '')
485
495
486
496
convert_ul = convert_list
487
497
convert_ol = convert_list
488
498
489
- def convert_li (self , el , text , convert_as_inline ):
499
+ def convert_li (self , el , text , parent_tags ):
490
500
# handle some early-exit scenarios
491
501
text = (text or '' ).strip ()
492
502
if not text :
@@ -523,8 +533,8 @@ def _indent_for_li(match):
523
533
524
534
return '%s\n ' % text
525
535
526
- def convert_p (self , el , text , convert_as_inline ):
527
- if convert_as_inline :
536
+ def convert_p (self , el , text , parent_tags ):
537
+ if '_inline' in parent_tags :
528
538
return ' ' + text .strip () + ' '
529
539
text = text .strip ()
530
540
if self .options ['wrap' ]:
@@ -546,7 +556,7 @@ def convert_p(self, el, text, convert_as_inline):
546
556
text = '\n ' .join (new_lines )
547
557
return '\n \n %s\n \n ' % text if text else ''
548
558
549
- def convert_pre (self , el , text , convert_as_inline ):
559
+ def convert_pre (self , el , text , parent_tags ):
550
560
if not text :
551
561
return ''
552
562
code_language = self .options ['code_language' ]
@@ -556,10 +566,10 @@ def convert_pre(self, el, text, convert_as_inline):
556
566
557
567
return '\n \n ```%s\n %s\n ```\n \n ' % (code_language , text )
558
568
559
def convert_script(self, el, text, parent_tags):
    """Drop <script> elements: always returns an empty string."""
    return ''
561
571
562
def convert_style(self, el, text, parent_tags):
    """Drop <style> elements: always returns an empty string."""
    return ''
564
574
565
575
convert_s = convert_del
@@ -572,28 +582,28 @@ def convert_style(self, el, text, convert_as_inline):
572
582
573
583
convert_sup = abstract_inline_conversion (lambda self : self .options ['sup_symbol' ])
574
584
575
def convert_table(self, el, text, parent_tags):
    """Wrap the converted table text, stripped, in blank lines."""
    return '\n\n' + text.strip() + '\n\n'
577
587
578
def convert_caption(self, el, text, parent_tags):
    """Return the stripped caption text followed by a blank line."""
    return text.strip() + '\n\n'
580
590
581
def convert_figcaption(self, el, text, parent_tags):
    """Wrap the stripped figure caption text in blank lines."""
    return '\n\n' + text.strip() + '\n\n'
583
593
584
def convert_td(self, el, text, parent_tags):
    """Convert a <td> cell to one table-row segment.

    The cell text is stripped and its newlines replaced by spaces so the
    cell stays on one row. One ' |' separator is appended per spanned
    column; the count comes from a numeric 'colspan' attribute and
    defaults to 1.
    """
    colspan = 1
    if 'colspan' in el.attrs and el['colspan'].isdigit():
        try:
            colspan = int(el['colspan'])
        except ValueError:
            # str.isdigit() accepts characters such as superscript digits
            # that int() rejects; fall back to a single column
            colspan = 1
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
589
599
590
def convert_th(self, el, text, parent_tags):
    """Convert a <th> cell to one table-row segment.

    The cell text is stripped and its newlines replaced by spaces so the
    cell stays on one row. One ' |' separator is appended per spanned
    column; the count comes from a numeric 'colspan' attribute and
    defaults to 1.
    """
    colspan = 1
    if 'colspan' in el.attrs and el['colspan'].isdigit():
        try:
            colspan = int(el['colspan'])
        except ValueError:
            # str.isdigit() accepts characters such as superscript digits
            # that int() rejects; fall back to a single column
            colspan = 1
    return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
595
605
596
- def convert_tr (self , el , text , convert_as_inline ):
606
+ def convert_tr (self , el , text , parent_tags ):
597
607
cells = el .find_all (['td' , 'th' ])
598
608
is_first_row = el .find_previous_sibling () is None
599
609
is_headrow = (
0 commit comments