Skip to content

Commit ed36437

Browse files
authored
feat(HTML): Export formulas with mathml (#144)
* remove unneeded logic: the labels allowlist is already checked beforehand Signed-off-by: Michele Dolfi <[email protected]> * textitem cannot have label code Signed-off-by: Michele Dolfi <[email protected]> * display formulas with mathml in exported html Signed-off-by: Michele Dolfi <[email protected]> * expose argument in save_as_html Signed-off-by: Michele Dolfi <[email protected]> * rename sanitize to prepare and add \n Signed-off-by: Michele Dolfi <[email protected]> * fix mypy parsing Signed-off-by: Michele Dolfi <[email protected]> * remove unused/impossible elif Signed-off-by: Michele Dolfi <[email protected]> * remove strip() Signed-off-by: Michele Dolfi <[email protected]> * add display none for latex annotation Signed-off-by: Michele Dolfi <[email protected]> * make mathml the default Signed-off-by: Michele Dolfi <[email protected]> * revert wrong commit Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Michele Dolfi <[email protected]>
1 parent eb96e31 commit ed36437

12 files changed

+102
-39
lines changed

docling_core/types/doc/document.py

Lines changed: 49 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
from pathlib import Path
1717
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
1818
from urllib.parse import quote, unquote
19+
from xml.etree.cElementTree import SubElement, tostring
20+
from xml.sax.saxutils import unescape
1921

22+
import latex2mathml.converter
2023
import pandas as pd
2124
import yaml
2225
from PIL import Image as PILImage
@@ -1387,6 +1390,9 @@ class DoclingDocument(BaseModel):
13871390
table tr:nth-child(even) td{
13881391
background-color: LightGray;
13891392
}
1393+
math annotation {
1394+
display: none;
1395+
}
13901396
</style>
13911397
</head>"""
13921398

@@ -2282,6 +2288,7 @@ def save_as_html(
22822288
to_element: int = sys.maxsize,
22832289
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
22842290
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2291+
formula_to_mathml: bool = True,
22852292
page_no: Optional[int] = None,
22862293
html_lang: str = "en",
22872294
html_head: str = _HTML_DEFAULT_HEAD,
@@ -2301,6 +2308,7 @@ def save_as_html(
23012308
to_element=to_element,
23022309
labels=labels,
23032310
image_mode=image_mode,
2311+
formula_to_mathml=formula_to_mathml,
23042312
page_no=page_no,
23052313
html_lang=html_lang,
23062314
html_head=html_head,
@@ -2347,6 +2355,7 @@ def export_to_html( # noqa: C901
23472355
to_element: int = sys.maxsize,
23482356
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
23492357
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
2358+
formula_to_mathml: bool = True,
23502359
page_no: Optional[int] = None,
23512360
html_lang: str = "en",
23522361
html_head: str = _HTML_DEFAULT_HEAD,
@@ -2381,9 +2390,13 @@ def close_lists(
23812390

23822391
in_ordered_list: List[bool] = [] # False
23832392

2384-
def _sanitize_text(text: str, do_escape_html=True) -> str:
2393+
def _prepare_tag_content(
2394+
text: str, do_escape_html=True, do_replace_newline=True
2395+
) -> str:
23852396
if do_escape_html:
23862397
text = html.escape(text, quote=False)
2398+
if do_replace_newline:
2399+
text = text.replace("\n", "<br>")
23872400
return text
23882401

23892402
for ix, (item, curr_level) in enumerate(
@@ -2416,7 +2429,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
24162429
]:
24172430

24182431
text = "<ol>"
2419-
html_texts.append(text.strip())
2432+
html_texts.append(text)
24202433

24212434
# Increment list nesting level when entering a new list
24222435
in_ordered_list.append(True)
@@ -2426,7 +2439,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
24262439
]:
24272440

24282441
text = "<ul>"
2429-
html_texts.append(text.strip())
2442+
html_texts.append(text)
24302443

24312444
# Increment list nesting level when entering a new list
24322445
in_ordered_list.append(False)
@@ -2436,63 +2449,62 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
24362449

24372450
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
24382451

2439-
text = f"<h1>{_sanitize_text(item.text)}</h1>"
2440-
html_texts.append(text.strip())
2452+
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
2453+
html_texts.append(text)
24412454

24422455
elif isinstance(item, SectionHeaderItem):
24432456

2444-
section_level: int = item.level + 1
2457+
section_level: int = min(item.level + 1, 6)
24452458

24462459
text = (
24472460
f"<h{(section_level)}>"
2448-
f"{_sanitize_text(item.text)}</h{(section_level)}>"
2461+
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
24492462
)
2450-
html_texts.append(text.strip())
2451-
2452-
elif isinstance(item, TextItem) and item.label in [
2453-
DocItemLabel.SECTION_HEADER
2454-
]:
2455-
2456-
section_level = curr_level
2457-
2458-
if section_level <= 1:
2459-
section_level = 2
2463+
html_texts.append(text)
24602464

2461-
if section_level >= 6:
2462-
section_level = 6
2465+
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
24632466

2464-
text = (
2465-
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
2467+
math_formula = _prepare_tag_content(
2468+
item.text, do_escape_html=False, do_replace_newline=False
24662469
)
2467-
html_texts.append(text.strip())
2468-
2469-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
2470+
if formula_to_mathml:
2471+
# Building a math equation in MathML format
2472+
# ref https://www.w3.org/TR/wai-aria-1.1/#math
2473+
mathml_element = latex2mathml.converter.convert_to_element(
2474+
math_formula, display="block"
2475+
)
2476+
annotation = SubElement(
2477+
mathml_element, "annotation", dict(encoding="TeX")
2478+
)
2479+
annotation.text = math_formula
2480+
mathml = unescape(tostring(mathml_element, encoding="unicode"))
2481+
text = f"<div>{mathml}</div>"
24702482

2471-
text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
2483+
else:
2484+
text = f"<pre>{math_formula}</pre>"
24722485
html_texts.append(text)
24732486

24742487
elif isinstance(item, ListItem):
24752488

2476-
text = f"<li>{_sanitize_text(item.text)}</li>"
2489+
text = f"<li>{_prepare_tag_content(item.text)}</li>"
24772490
html_texts.append(text)
24782491

24792492
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
24802493

2481-
text = f"<li>{_sanitize_text(item.text)}</li>"
2494+
text = f"<li>{_prepare_tag_content(item.text)}</li>"
24822495
html_texts.append(text)
24832496

2484-
elif isinstance(item, CodeItem) and item.label in labels:
2485-
text = (
2486-
"<pre><code>"
2487-
f"{_sanitize_text(item.text, do_escape_html=False)}"
2488-
"</code></pre>"
2497+
elif isinstance(item, CodeItem):
2498+
code_text = _prepare_tag_content(
2499+
item.text, do_escape_html=False, do_replace_newline=False
24892500
)
2490-
html_texts.append(text.strip())
2501+
text = f"<pre><code>{code_text}</code></pre>"
2502+
html_texts.append(text)
24912503

2492-
elif isinstance(item, TextItem) and item.label in labels:
2504+
elif isinstance(item, TextItem):
24932505

2494-
text = f"<p>{_sanitize_text(item.text)}</p>"
2495-
html_texts.append(text.strip())
2506+
text = f"<p>{_prepare_tag_content(item.text)}</p>"
2507+
html_texts.append(text)
24962508
elif isinstance(item, TableItem):
24972509

24982510
text = item.export_to_html(doc=self, add_caption=True)
@@ -2513,8 +2525,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
25132525

25142526
lines = []
25152527
lines.extend(head_lines)
2516-
for i, line in enumerate(html_texts):
2517-
lines.append(line.replace("\n", "<br>"))
2528+
lines.extend(html_texts)
25182529

25192530
delim = "\n"
25202531
html_text = (delim.join(lines)).strip()

poetry.lock

Lines changed: 12 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
5959
transformers = { version = "^4.34.0", optional = true }
6060
semchunk = { version = "^2.2.0", optional = true }
6161
typer = "^0.12.5"
62+
latex2mathml = "^3.77.0"
6263

6364
[tool.poetry.extras]
6465
chunking = ["transformers", "semchunk"]

test/data/doc/2206.01062.yaml.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>

test/data/doc/bad_doc.yaml.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h1>This is the title</h1>

test/data/doc/constructed_doc.embedded.html.gt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h1>Title of the Document</h1>

test/data/doc/constructed_doc.placeholder.html.gt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h1>Title of the Document</h1>

test/data/doc/constructed_doc.referenced.html.gt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h1>Title of the Document</h1>

test/data/doc/constructed_document.yaml.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h1>Title of the Document</h1>

test/data/doc/dummy_doc.yaml.html

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@
5353
table tr:nth-child(even) td{
5454
background-color: LightGray;
5555
}
56+
math annotation {
57+
display: none;
58+
}
5659
</style>
5760
</head>
5861
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
4+
<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mfrac><mrow><mn>1</mn></mrow><mrow><mi>x</mi></mrow></mfrac></mrow><annotation encoding="TeX">\frac{1}{x}</annotation></math></div>
5+
</html>

test/test_docling_doc.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -661,6 +661,20 @@ def test_version_doc():
661661
assert doc.version == CURRENT_VERSION
662662

663663

664+
def test_formula_mathml():
665+
doc = DoclingDocument(name="Dummy")
666+
equation = "\\frac{1}{x}"
667+
doc.add_text(label=DocItemLabel.FORMULA, text=equation)
668+
669+
doc_html = doc.export_to_html(formula_to_mathml=True, html_head="")
670+
671+
gt_html = Path("test/data/docling_document/export/formula_mathml.html").read_text(
672+
encoding="utf8"
673+
)
674+
675+
assert doc_html == gt_html
676+
677+
664678
def test_docitem_get_image():
665679
# Prepare the document
666680
doc = DoclingDocument(name="Dummy")

0 commit comments

Comments
 (0)