Skip to content

Commit a5b09f1

Browse files
committed
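Propagate a set of parent tag names downward to improve runtime (converters now test membership in the passed-down parent_tags set instead of calling el.find_parent() for each element)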
Signed-off-by: chrispy <[email protected]>
1 parent 3026602 commit a5b09f1

File tree

2 files changed

+86
-76
lines changed

2 files changed

+86
-76
lines changed

markdownify/__init__.py

Lines changed: 83 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -53,13 +53,13 @@ def abstract_inline_conversion(markup_fn):
5353
the text if it looks like an HTML tag. markup_fn is necessary to allow for
5454
references to self.strong_em_symbol etc.
5555
"""
56-
def implementation(self, el, text, convert_as_inline):
56+
def implementation(self, el, text, parent_tags):
5757
markup_prefix = markup_fn(self)
5858
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
5959
markup_suffix = '</' + markup_prefix[1:]
6060
else:
6161
markup_suffix = markup_prefix
62-
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
62+
if '_noformat' in parent_tags:
6363
return text
6464
prefix, suffix, text = chomp(text)
6565
if not text:
@@ -166,18 +166,13 @@ def convert(self, html):
166166
return self.convert_soup(soup)
167167

168168
def convert_soup(self, soup):
169-
return self.process_tag(soup, convert_as_inline=False)
169+
return self.process_tag(soup, parent_tags=set())
170170

171-
def process_tag(self, node, convert_as_inline):
172-
text = ''
171+
def process_tag(self, node, parent_tags=None):
172+
if parent_tags is None:
173+
parent_tags = set()
173174

174-
# For Markdown headings and table cells, convert children as inline
175-
# (so that block element children do not produce newlines).
176-
convert_children_as_inline = (
177-
convert_as_inline # propagated from parent
178-
or html_heading_re.match(node.name) is not None # headings
179-
or node.name in ['td', 'th'] # table cells
180-
)
175+
text = ''
181176

182177
# Collect child elements to process, ignoring whitespace-only text elements
183178
# adjacent to the inner/outer boundaries of block elements.
@@ -208,28 +203,48 @@ def _can_ignore(el):
208203

209204
children_to_convert = [child for child in node.children if not _can_ignore(child)]
210205

206+
node_name = node.name
207+
211208
# Convert the children first
212-
for el in children_to_convert:
213-
if isinstance(el, NavigableString):
214-
text += self.process_text(el)
215-
else:
216-
text_strip = text.rstrip('\n')
217-
newlines_left = len(text) - len(text_strip)
218-
next_text = self.process_tag(el, convert_children_as_inline)
219-
next_text_strip = next_text.lstrip('\n')
220-
newlines_right = len(next_text) - len(next_text_strip)
221-
newlines = '\n' * max(newlines_left, newlines_right)
222-
text = text_strip + newlines + next_text_strip
209+
if children_to_convert:
210+
# for children tags, start with a copy of the parent tag set
211+
parent_tags_for_children = set(parent_tags)
212+
213+
# add this tag's name as a parent
214+
parent_tags_for_children.add(node_name)
215+
216+
# if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
217+
if (
218+
html_heading_re.match(node_name) is not None # headings
219+
or node_name in {'td', 'th'} # table cells
220+
):
221+
parent_tags_for_children.add('_inline')
222+
223+
# if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
224+
if node_name in {'pre', 'code', 'kbd', 'samp'}:
225+
parent_tags_for_children.add('_noformat')
226+
227+
for el in children_to_convert:
228+
if isinstance(el, NavigableString):
229+
text += self.process_text(el, parent_tags=parent_tags_for_children)
230+
else:
231+
text_strip = text.rstrip('\n')
232+
newlines_left = len(text) - len(text_strip)
233+
next_text = self.process_tag(el, parent_tags=parent_tags_for_children)
234+
next_text_strip = next_text.lstrip('\n')
235+
newlines_right = len(next_text) - len(next_text_strip)
236+
newlines = '\n' * max(newlines_left, newlines_right)
237+
text = text_strip + newlines + next_text_strip
223238

224239
# apply this tag's final conversion function
225-
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
240+
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node_name)
226241
convert_fn = getattr(self, convert_fn_name, None)
227-
if convert_fn and self.should_convert_tag(node.name):
228-
text = convert_fn(node, text, convert_as_inline)
242+
if convert_fn and self.should_convert_tag(node_name):
243+
text = convert_fn(node, text, parent_tags=parent_tags)
229244

230245
return text
231246

232-
def convert__document_(self, el, text, convert_as_inline):
247+
def convert__document_(self, el, text, parent_tags):
233248
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
234249
if self.options['strip_document'] == LSTRIP:
235250
text = text.lstrip('\n') # remove leading separation newlines
@@ -244,19 +259,19 @@ def convert__document_(self, el, text, convert_as_inline):
244259

245260
return text
246261

247-
def process_text(self, el):
262+
def process_text(self, el, parent_tags):
248263
text = six.text_type(el) or ''
249264

250265
# normalize whitespace if we're not inside a preformatted element
251-
if not el.find_parent('pre'):
266+
if 'pre' not in parent_tags:
252267
if self.options['wrap']:
253268
text = all_whitespace_re.sub(' ', text)
254269
else:
255270
text = newline_whitespace_re.sub('\n', text)
256271
text = whitespace_re.sub(' ', text)
257272

258273
# escape special characters if we're not inside a preformatted or code element
259-
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
274+
if '_noformat' not in parent_tags:
260275
text = self.escape(text)
261276

262277
# remove leading whitespace at the start or just after a
@@ -279,8 +294,8 @@ def __getattr__(self, attr):
279294
if m:
280295
n = int(m.group(1))
281296

282-
def convert_tag(el, text, convert_as_inline):
283-
return self._convert_hn(n, el, text, convert_as_inline)
297+
def convert_tag(el, text, parent_tags):
298+
return self._convert_hn(n, el, text, parent_tags)
284299

285300
convert_tag.__name__ = 'convert_h%s' % n
286301
setattr(self, convert_tag.__name__, convert_tag)
@@ -327,8 +342,8 @@ def underline(self, text, pad_char):
327342
text = (text or '').rstrip()
328343
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
329344

330-
def convert_a(self, el, text, convert_as_inline):
331-
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
345+
def convert_a(self, el, text, parent_tags):
346+
if '_noformat' in parent_tags:
332347
return text
333348
prefix, suffix, text = chomp(text)
334349
if not text:
@@ -349,10 +364,10 @@ def convert_a(self, el, text, convert_as_inline):
349364

350365
convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
351366

352-
def convert_blockquote(self, el, text, convert_as_inline):
367+
def convert_blockquote(self, el, text, parent_tags):
353368
# handle some early-exit scenarios
354369
text = (text or '').strip()
355-
if convert_as_inline:
370+
if '_inline' in parent_tags:
356371
return ' ' + text + ' '
357372
if not text:
358373
return "\n"
@@ -365,25 +380,25 @@ def _indent_for_blockquote(match):
365380

366381
return '\n' + text + '\n\n'
367382

368-
def convert_br(self, el, text, convert_as_inline):
369-
if convert_as_inline:
383+
def convert_br(self, el, text, parent_tags):
384+
if '_inline' in parent_tags:
370385
return ""
371386

372387
if self.options['newline_style'].lower() == BACKSLASH:
373388
return '\\\n'
374389
else:
375390
return ' \n'
376391

377-
def convert_code(self, el, text, convert_as_inline):
378-
if el.parent.name == 'pre':
392+
def convert_code(self, el, text, parent_tags):
393+
if 'pre' in parent_tags:
379394
return text
380395
converter = abstract_inline_conversion(lambda self: '`')
381-
return converter(self, el, text, convert_as_inline)
396+
return converter(self, el, text, parent_tags)
382397

383398
convert_del = abstract_inline_conversion(lambda self: '~~')
384399

385-
def convert_div(self, el, text, convert_as_inline):
386-
if convert_as_inline:
400+
def convert_div(self, el, text, parent_tags):
401+
if '_inline' in parent_tags:
387402
return ' ' + text.strip() + ' '
388403
text = text.strip()
389404
return '\n\n%s\n\n' % text if text else ''
@@ -396,9 +411,9 @@ def convert_div(self, el, text, convert_as_inline):
396411

397412
convert_kbd = convert_code
398413

399-
def convert_dd(self, el, text, convert_as_inline):
414+
def convert_dd(self, el, text, parent_tags):
400415
text = (text or '').strip()
401-
if convert_as_inline:
416+
if '_inline' in parent_tags:
402417
return ' ' + text + ' '
403418
if not text:
404419
return '\n'
@@ -414,11 +429,11 @@ def _indent_for_dd(match):
414429

415430
return '%s\n' % text
416431

417-
def convert_dt(self, el, text, convert_as_inline):
432+
def convert_dt(self, el, text, parent_tags):
418433
# remove newlines from term text
419434
text = (text or '').strip()
420435
text = all_whitespace_re.sub(' ', text)
421-
if convert_as_inline:
436+
if '_inline' in parent_tags:
422437
return ' ' + text + ' '
423438
if not text:
424439
return '\n'
@@ -428,9 +443,9 @@ def convert_dt(self, el, text, convert_as_inline):
428443

429444
return '\n%s\n' % text
430445

431-
def _convert_hn(self, n, el, text, convert_as_inline):
446+
def _convert_hn(self, n, el, text, parent_tags):
432447
""" Method name prefixed with _ to prevent <hn> to call this """
433-
if convert_as_inline:
448+
if '_inline' in parent_tags:
434449
return text
435450

436451
# prevent MemoryErrors in case of very large n
@@ -447,46 +462,41 @@ def _convert_hn(self, n, el, text, convert_as_inline):
447462
return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
448463
return '\n\n%s %s\n\n' % (hashes, text)
449464

450-
def convert_hr(self, el, text, convert_as_inline):
465+
def convert_hr(self, el, text, parent_tags):
451466
return '\n\n---\n\n'
452467

453468
convert_i = convert_em
454469

455-
def convert_img(self, el, text, convert_as_inline):
470+
def convert_img(self, el, text, parent_tags):
456471
alt = el.attrs.get('alt', None) or ''
457472
src = el.attrs.get('src', None) or ''
458473
title = el.attrs.get('title', None) or ''
459474
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
460-
if (convert_as_inline
475+
if ('_inline' in parent_tags
461476
and el.parent.name not in self.options['keep_inline_images_in']):
462477
return alt
463478

464479
return '![%s](%s%s)' % (alt, src, title_part)
465480

466-
def convert_list(self, el, text, convert_as_inline):
481+
def convert_list(self, el, text, parent_tags):
467482

468483
# Converting a list to inline is undefined.
469-
# Ignoring convert_to_inline for list.
484+
# Ignoring inline conversion parents for list.
470485

471486
nested = False
472487
before_paragraph = False
473488
next_sibling = _next_block_content_sibling(el)
474489
if next_sibling and next_sibling.name not in ['ul', 'ol']:
475490
before_paragraph = True
476-
while el:
477-
if el.name == 'li':
478-
nested = True
479-
break
480-
el = el.parent
481-
if nested:
482-
# remove trailing newline if nested
491+
if 'li' in parent_tags:
492+
# remove trailing newline if we're in a nested list
483493
return '\n' + text.rstrip()
484494
return '\n\n' + text + ('\n' if before_paragraph else '')
485495

486496
convert_ul = convert_list
487497
convert_ol = convert_list
488498

489-
def convert_li(self, el, text, convert_as_inline):
499+
def convert_li(self, el, text, parent_tags):
490500
# handle some early-exit scenarios
491501
text = (text or '').strip()
492502
if not text:
@@ -523,8 +533,8 @@ def _indent_for_li(match):
523533

524534
return '%s\n' % text
525535

526-
def convert_p(self, el, text, convert_as_inline):
527-
if convert_as_inline:
536+
def convert_p(self, el, text, parent_tags):
537+
if '_inline' in parent_tags:
528538
return ' ' + text.strip() + ' '
529539
text = text.strip()
530540
if self.options['wrap']:
@@ -546,7 +556,7 @@ def convert_p(self, el, text, convert_as_inline):
546556
text = '\n'.join(new_lines)
547557
return '\n\n%s\n\n' % text if text else ''
548558

549-
def convert_pre(self, el, text, convert_as_inline):
559+
def convert_pre(self, el, text, parent_tags):
550560
if not text:
551561
return ''
552562
code_language = self.options['code_language']
@@ -556,10 +566,10 @@ def convert_pre(self, el, text, convert_as_inline):
556566

557567
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
558568

559-
def convert_script(self, el, text, convert_as_inline):
569+
def convert_script(self, el, text, parent_tags):
560570
return ''
561571

562-
def convert_style(self, el, text, convert_as_inline):
572+
def convert_style(self, el, text, parent_tags):
563573
return ''
564574

565575
convert_s = convert_del
@@ -572,28 +582,28 @@ def convert_style(self, el, text, convert_as_inline):
572582

573583
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
574584

575-
def convert_table(self, el, text, convert_as_inline):
585+
def convert_table(self, el, text, parent_tags):
576586
return '\n\n' + text.strip() + '\n\n'
577587

578-
def convert_caption(self, el, text, convert_as_inline):
588+
def convert_caption(self, el, text, parent_tags):
579589
return text.strip() + '\n\n'
580590

581-
def convert_figcaption(self, el, text, convert_as_inline):
591+
def convert_figcaption(self, el, text, parent_tags):
582592
return '\n\n' + text.strip() + '\n\n'
583593

584-
def convert_td(self, el, text, convert_as_inline):
594+
def convert_td(self, el, text, parent_tags):
585595
colspan = 1
586596
if 'colspan' in el.attrs and el['colspan'].isdigit():
587597
colspan = int(el['colspan'])
588598
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
589599

590-
def convert_th(self, el, text, convert_as_inline):
600+
def convert_th(self, el, text, parent_tags):
591601
colspan = 1
592602
if 'colspan' in el.attrs and el['colspan'].isdigit():
593603
colspan = int(el['colspan'])
594604
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
595605

596-
def convert_tr(self, el, text, convert_as_inline):
606+
def convert_tr(self, el, text, parent_tags):
597607
cells = el.find_all(['td', 'th'])
598608
is_first_row = el.find_previous_sibling() is None
599609
is_headrow = (

tests/test_custom_converter.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
66
"""
77
Create a custom MarkdownConverter for unit tests
88
"""
9-
def convert_img(self, el, text, convert_as_inline):
9+
def convert_img(self, el, text, parent_tags):
1010
"""Add two newlines after an image"""
11-
return super().convert_img(el, text, convert_as_inline) + '\n\n'
11+
return super().convert_img(el, text, parent_tags) + '\n\n'
1212

13-
def convert_custom_tag(self, el, text, convert_as_inline):
13+
def convert_custom_tag(self, el, text, parent_tags):
1414
"""Ensure conversion function is found for tags with special characters in name"""
1515
return "FUNCTION USED: %s" % text
1616

0 commit comments

Comments
 (0)