Skip to content

Commit dd46342

Browse files
committed
remove unclosed tags
1 parent 474982f commit dd46342

File tree

1 file changed

+18
-6
lines changed

1 file changed

+18
-6
lines changed

v2_utils.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,28 +27,40 @@ def define_link_data(usernames):
2727
logging.info(f"{e}---define_link_data")
2828
return []
2929

30+
3031
# HTML elements that never take a closing tag; they must not be auto-closed.
_VOID_ELEMENTS = frozenset({
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
})

# Group 1: '/' when the tag is a closing tag, otherwise ''.
# Group 2: the tag name (must start with a letter, so comments, doctypes and
#          stray '<' characters in prose are left alone as plain text).
# Group 3: everything else up to '>' (attributes and an optional trailing '/').
_TAG_PATTERN = re.compile(r'<(/?)([A-Za-z][^\s/>]*)([^>]*)>')


def remove_unmatched_tags(text):
    """Balance the tags of an HTML fragment.

    Closing tags that have no matching opening tag are removed, and opening
    tags that are never closed get a closing tag appended (innermost first).
    All text outside of tags is preserved.

    Args:
        text: The HTML fragment to clean.

    Returns:
        The balanced fragment. If ``text`` is not a ``str`` it is returned
        unchanged (best-effort behaviour, the problem is logged).

    Note:
        Tags are recognised with a regular expression, so a ``>`` inside a
        quoted attribute value is not supported.
    """
    if not isinstance(text, str):
        logging.info(
            "%s---remove_unmatched_tags",
            f"expected str, got {type(text).__name__}",
        )
        return text

    # A closing tag at the very start can never be matched; drop it together
    # with the whitespace around it.
    text = re.sub(r'^\s*</[^>]+>\s*', '', text)

    pieces = []      # output fragments, joined once at the end
    open_tags = []   # names of currently open tags, outermost first
    cursor = 0       # index in ``text`` just past the last consumed tag

    for match in _TAG_PATTERN.finditer(text):
        # Keep the plain text that sits between the previous tag and this one.
        pieces.append(text[cursor:match.start()])
        cursor = match.end()

        slash, name, attributes = match.groups()
        key = name.lower()  # HTML tag names are case-insensitive

        if not slash:
            pieces.append(match.group(0))
            self_closing = attributes.rstrip().endswith('/')
            if key not in _VOID_ELEMENTS and not self_closing:
                open_tags.append(name)
            continue

        # Closing tag: find the nearest open tag with the same name.
        depth = next(
            (
                index
                for index in range(len(open_tags) - 1, -1, -1)
                if open_tags[index].lower() == key
            ),
            None,
        )
        if depth is None:
            # Nothing to close: this is an unmatched closing tag, drop it.
            continue

        # Close any tags that were opened inside and left unclosed.
        while len(open_tags) > depth + 1:
            pieces.append(f'</{open_tags.pop()}>')
        open_tags.pop()
        pieces.append(match.group(0))

    pieces.append(text[cursor:])

    # Close whatever is still open, innermost first.
    pieces.extend(f'</{name}>' for name in reversed(open_tags))
    return ''.join(pieces)
5063

51-
5264

5365

5466
def week_data_formatter(html_content, type):

0 commit comments

Comments
 (0)