3535
3636# Included for versions which do not have current comment fix
3737commentclose = re .compile (r'--!?>' )
38- commentabruptclose = re .compile (r'-?>' )
3938
4039# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
4140# Users can still do `from html import parser` and get the default behavior.
4847# throwing it away. When we see it, we will process it as data.
4948htmlparser .starttagopen = re .compile ('<[a-zA-Z]|</>' )
5049
50+ htmlparser .endtagopen = re .compile ('</[a-zA-Z]?' )
51+
5152# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
5253htmlparser .piclose = re .compile (r'\?>' )
5354# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
9293blank_line_re = re .compile (r'^([ ]*\n){2}' )
9394
9495
96+ class _HTMLParser (htmlparser .HTMLParser ):
97+ """Handle special start and end tags."""
98+
99+ def parse_endtag (self , i ):
100+ start = self .rawdata [i :i + 3 ]
101+ c = ord (start [- 1 ])
102+ if len (start ) < 3 or not (65 <= c <= 90 or 97 <= c <= 122 ):
103+ self .handle_data (self .rawdata [i :i + 2 ])
104+ return i + 2
105+ return super ().parse_endtag (i )
106+
107+ def parse_starttag (self , i : int ) -> int : # pragma: no cover
108+ # Treat `</>` as normal data as it is not a real tag.
109+ if self .rawdata [i :i + 3 ] == '</>' :
110+ self .handle_data (self .rawdata [i :i + 3 ])
111+ return i + 3
112+
113+ return super ().parse_starttag (i )
114+
115+
116+ # Replace `HTMLParser` on our copy of the module with the custom subclass so that projects which pull it in from here (MkDocs, for example) get the same behavior.
117+ htmlparser .HTMLParser = _HTMLParser
118+
119+
95120class HTMLExtractor (htmlparser .HTMLParser ):
96121 """
97122 Extract raw HTML from text.
@@ -110,9 +135,6 @@ def __init__(self, md: Markdown, *args, **kwargs):
110135
111136 self .lineno_start_cache = [0 ]
112137
113- self .override_comment_update = False
114- self .override_comment_start = 0
115-
116138 # This calls self.reset
117139 super ().__init__ (* args , ** kwargs )
118140 self .md = md
@@ -125,8 +147,6 @@ def reset(self):
125147 self ._cache : list [str ] = []
126148 self .cleandoc : list [str ] = []
127149 self .lineno_start_cache = [0 ]
128- self .override_comment_start = 0
129- self .override_comment_update = False
130150
131151 super ().reset ()
132152
@@ -276,22 +296,8 @@ def handle_entityref(self, name: str):
276296
277297 def handle_comment (self , data : str ):
278298 # Re-wrap the comment text in its `<!--`/`-->` delimiters and pass it to `handle_empty_tag` as a block-level element.
279- j = self .rawdata .find (data )
280- i = j - 2
281- if self .rawdata [i :j ] == '</' :
282- self .handle_data ('<' )
283- self .override_comment_start = i
284- self .override_comment_update = True
285- return
286299 self .handle_empty_tag ('<!--{}-->' .format (data ), is_block = True )
287300
288- def updatepos (self , i : int , j : int ) -> int :
289- if self .override_comment_update :
290- self .override_comment_update = False
291- i = self .override_comment_start
292- j = self .override_comment_start + 1
293- return super ().updatepos (i , j )
294-
295301 def handle_decl (self , data : str ):
296302 self .handle_empty_tag ('<!{}>' .format (data ), is_block = True )
297303
0 commit comments