Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ See the [Contributing Guide](contributing.md) for details.
### Fixed

* Fix a regression related to comment handling (#1590).
* More reliable fix for the handling of a bare `</` (an end tag opener that is not followed by a tag name).

## [3.10.1] - 2026-01-21

Expand Down
54 changes: 34 additions & 20 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@

# Included for versions which do not have current comment fix
# `commentclose` matches a comment terminator: `-->` or `--!>`.
commentclose = re.compile(r'--!?>')
# `commentabruptclose` matches `>` optionally preceded by a single `-`.
# NOTE(review): presumably for abruptly closed comments such as `<!-->` or `<!--->` - confirm.
commentabruptclose = re.compile(r'-?>')

# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
Expand All @@ -48,6 +47,8 @@
# throwing it away. When we see it, we will process it as data.
htmlparser.starttagopen = re.compile('<[a-zA-Z]|</>')

# Monkeypatch `HTMLParser` to also recognize a bare `</` as the opening of an end tag.
# NOTE(review): presumably so that `</` is routed to `parse_endtag`, where the overrides in
# this file emit it as data when no ASCII letter follows - confirm against `html.parser`.
htmlparser.endtagopen = re.compile('</[a-zA-Z]|</')

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
Expand Down Expand Up @@ -92,6 +93,30 @@
# Matches two consecutive lines at the very start of a string that hold nothing but
# spaces (each line may also be empty).
blank_line_re = re.compile(r'^([ ]*\n){2}')


class _HTMLParser(htmlparser.HTMLParser):
"""Handle special start and end tags."""

def parse_endtag(self, i):
start = self.rawdata[i:i+3]
c = ord(start[-1])
if len(start) < 3 or not (65 <= c <= 90 or 97 <= c <= 122):
self.handle_data(self.rawdata[i:i + 2])
return i + 2
return super().parse_endtag(i)

def parse_starttag(self, i: int) -> int: # pragma: no cover
# Treat `</>` as normal data as it is not a real tag.
if self.rawdata[i:i + 3] == '</>':
self.handle_data(self.rawdata[i:i + 3])
return i + 3

return super().parse_starttag(i)


# Replace `HTMLParser` on our copy of the `html.parser` module with the patched subclass so
# that projects which pull the parser in from here (MkDocs, for example) get it as well.
htmlparser.HTMLParser = _HTMLParser


class HTMLExtractor(htmlparser.HTMLParser):
"""
Extract raw HTML from text.
Expand All @@ -110,9 +135,6 @@ def __init__(self, md: Markdown, *args, **kwargs):

self.lineno_start_cache = [0]

self.override_comment_update = False
self.override_comment_start = 0

# This calls self.reset
super().__init__(*args, **kwargs)
self.md = md
Expand All @@ -125,8 +147,6 @@ def reset(self):
self._cache: list[str] = []
self.cleandoc: list[str] = []
self.lineno_start_cache = [0]
self.override_comment_start = 0
self.override_comment_update = False

super().reset()

Expand Down Expand Up @@ -207,6 +227,14 @@ def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):
# This is presumably a standalone tag in a code span (see #1036).
self.clear_cdata_mode()

def parse_endtag(self, i):
    """
    Parse the end tag that opens at index `i` of `rawdata`.

    When `</` is immediately followed by an ASCII letter, parsing is delegated to the
    parent class. Otherwise the two characters `</` are handed to `handle_data` and the
    index just past them is returned.
    """
    opener = self.rawdata[i:i + 3]
    following = opener[-1]
    is_ascii_letter = 'A' <= following <= 'Z' or 'a' <= following <= 'z'
    if len(opener) == 3 and is_ascii_letter:
        return super().parse_endtag(i)

    # Not a real end tag: emit only the `</` and resume right after it.
    self.handle_data(self.rawdata[i:i + 2])
    return i + 2

def handle_endtag(self, tag: str):
text = self.get_endtag_text(tag)

Expand Down Expand Up @@ -276,22 +304,8 @@ def handle_entityref(self, name: str):

def handle_comment(self, data: str):
    """
    Handle a comment whose content is `data`.

    When the two characters preceding the first occurrence of `data` in `rawdata` are
    `</`, only `<` is emitted as data and the index of that `</` is recorded together
    with a flag so that the position can be overridden in `updatepos`. Otherwise the
    comment is rewrapped in `<!--` and `-->` and passed to `handle_empty_tag` as a
    block-level tag.
    """
    data_start = self.rawdata.find(data)
    opener_start = data_start - 2
    if self.rawdata[opener_start:data_start] != '</':
        self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)
        return

    # The "comment" was opened by `</`, so it is unclosed: override the position.
    self.handle_data('<')
    self.override_comment_start = opener_start
    self.override_comment_update = True

def updatepos(self, i: int, j: int) -> int:
    """
    Update the parser position for the span `i` to `j` of `rawdata`.

    While `override_comment_update` is set, the flag is cleared and the span passed to
    the parent class is replaced by the single character at `override_comment_start`.
    Otherwise `i` and `j` are forwarded untouched. Returns whatever the parent returns.
    """
    if not self.override_comment_update:
        return super().updatepos(i, j)

    self.override_comment_update = False
    begin = self.override_comment_start
    return super().updatepos(begin, begin + 1)

def handle_decl(self, data: str):
    """Rewrap the declaration `data` in `<!` and `>` and pass it on as a block-level empty tag."""
    declaration = '<!{}>'.format(data)
    self.handle_empty_tag(declaration, is_block=True)

Expand Down
55 changes: 55 additions & 0 deletions tests/test_syntax/blocks/test_html_blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1732,3 +1732,58 @@ def test_issue_1590(self):
'''
)
)

def test_stress_comment_handling(self):
"""
Stress test the comment handling.

Each input paragraph mixes unclosed `<!--` openers, bare `</`, `</>` and inline code
spans; all of them are expected to come out escaped inside `<p>` elements, while the
single real comment is expected to pass through unchanged.
"""

self.assertMarkdownRenders(
self.dedent(
'''
`</` <!-- `<!--[if mso]>` and <!-- </> and `<!--[if mso]>`
<!-- and <!-- `<!--[if mso]>` and </> `</` and `<!--[if mso]>`
<!-- Real comment -->
`<!--[if mso]>` `</` `<!--[if mso]>` and </> <!-- and <!--
</> `<!--[if mso]>` `</` <!-- and <!-- and `<!--[if mso]>`
'''
),
self.dedent(
'''
<p><code>&lt;/</code> &lt;!-- <code>&lt;!--[if mso]&gt;</code> and &lt;!-- &lt;/&gt; and <code>&lt;!--[if mso]&gt;</code></p>
<p>&lt;!-- and &lt;!-- <code>&lt;!--[if mso]&gt;</code> and &lt;/&gt; <code>&lt;/</code> and <code>&lt;!--[if mso]&gt;</code></p>
<!-- Real comment -->
<p><code>&lt;!--[if mso]&gt;</code> <code>&lt;/</code> <code>&lt;!--[if mso]&gt;</code> and &lt;/&gt; &lt;!-- and &lt;!--</p>
<p>&lt;/&gt; <code>&lt;!--[if mso]&gt;</code> <code>&lt;/</code> &lt;!-- and &lt;!-- and <code>&lt;!--[if mso]&gt;</code></p>
''' # noqa: E501
)
)

def test_unclosed_endtag(self):
"""
Ensure unclosed end tag does not have side effects.

A code span holding a bare `</` precedes a `<div>` block that contains conditional
comments; the block is expected to be rendered exactly as written.
"""

self.assertMarkdownRenders(
self.dedent(
'''
`</`
<div>
<!--[if mso]>-->
<p>foo</p>
<!--<!endif]-->
</div>
'''
),
self.dedent(
'''
<p><code>&lt;/</code></p>
<div>
<!--[if mso]>-->
<p>foo</p>
<!--<!endif]-->
</div>
'''
)
)