Skip to content

Commit 23477f7

Browse files
authored
feat(HTML): Fallback showing formulas as images (#146)
* remove un-needed logic the labels allowlist is checked before Signed-off-by: Michele Dolfi <[email protected]> * textitem cannot have label code Signed-off-by: Michele Dolfi <[email protected]> * display formulas with mathml in exported html Signed-off-by: Michele Dolfi <[email protected]> * expose argument in save_as_html Signed-off-by: Michele Dolfi <[email protected]> * rename sanitize in prepare and add \n Signed-off-by: Michele Dolfi <[email protected]> * fix mypy parsing Signed-off-by: Michele Dolfi <[email protected]> * remove unused/impossible elif Signed-off-by: Michele Dolfi <[email protected]> * remove strip() Signed-off-by: Michele Dolfi <[email protected]> * add display none for latex annotation Signed-off-by: Michele Dolfi <[email protected]> * fallback showing equations as image Signed-off-by: Michele Dolfi <[email protected]> * add html placeholder Signed-off-by: Michele Dolfi <[email protected]> * markdown placeholder Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Michele Dolfi <[email protected]>
1 parent ed36437 commit 23477f7

8 files changed

+129
-10
lines changed

docling_core/types/doc/document.py

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1393,6 +1393,17 @@ class DoclingDocument(BaseModel):
13931393
math annotation {
13941394
display: none;
13951395
}
1396+
.formula-not-decoded {
1397+
background: repeating-linear-gradient(
1398+
45deg, /* Angle of the stripes */
1399+
LightGray, /* First color */
1400+
LightGray 10px, /* Length of the first color */
1401+
White 10px, /* Second color */
1402+
White 20px /* Length of the second color */
1403+
);
1404+
margin: 0;
1405+
text-align: center;
1406+
}
13961407
</style>
13971408
</head>"""
13981409

@@ -2216,11 +2227,18 @@ def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
22162227

22172228
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
22182229
in_list = False
2219-
_append_text(
2220-
f"$${item.text}$$\n",
2221-
do_escape_underscores=False,
2222-
do_escape_html=False,
2223-
)
2230+
if item.text != "":
2231+
_append_text(
2232+
f"$${item.text}$$\n",
2233+
do_escape_underscores=False,
2234+
do_escape_html=False,
2235+
)
2236+
elif item.orig != "":
2237+
_append_text(
2238+
"<!-- formula-not-decoded -->\n",
2239+
do_escape_underscores=False,
2240+
do_escape_html=False,
2241+
)
22242242

22252243
elif isinstance(item, TextItem) and item.label in labels:
22262244
in_list = False
@@ -2467,9 +2485,27 @@ def _prepare_tag_content(
24672485
math_formula = _prepare_tag_content(
24682486
item.text, do_escape_html=False, do_replace_newline=False
24692487
)
2470-
if formula_to_mathml:
2471-
# Building a math equation in MathML format
2472-
# ref https://www.w3.org/TR/wai-aria-1.1/#math
2488+
text = ""
2489+
2490+
# If the formula is not processed correctly, use its image
2491+
if (
2492+
item.text == ""
2493+
and item.orig != ""
2494+
and image_mode == ImageRefMode.EMBEDDED
2495+
and len(item.prov) > 0
2496+
):
2497+
item_image = item.get_image(doc=self)
2498+
if item_image is not None:
2499+
img_ref = ImageRef.from_pil(item_image, dpi=72)
2500+
text = (
2501+
"<figure>"
2502+
f'<img src="{img_ref.uri}" alt="{item.orig}" />'
2503+
"</figure>"
2504+
)
2505+
2506+
# Building a math equation in MathML format
2507+
# ref https://www.w3.org/TR/wai-aria-1.1/#math
2508+
elif formula_to_mathml:
24732509
mathml_element = latex2mathml.converter.convert_to_element(
24742510
math_formula, display="block"
24752511
)
@@ -2480,9 +2516,15 @@ def _prepare_tag_content(
24802516
mathml = unescape(tostring(mathml_element, encoding="unicode"))
24812517
text = f"<div>{mathml}</div>"
24822518

2483-
else:
2519+
elif math_formula != "":
24842520
text = f"<pre>{math_formula}</pre>"
2485-
html_texts.append(text)
2521+
2522+
if text != "":
2523+
html_texts.append(text)
2524+
else:
2525+
html_texts.append(
2526+
'<div class="formula-not-decoded">Formula not decoded</div>'
2527+
)
24862528

24872529
elif isinstance(item, ListItem):
24882530

test/data/doc/2206.01062.yaml.html

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>

test/data/doc/bad_doc.yaml.html

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h1>This is the title</h1>

test/data/doc/constructed_doc.embedded.html.gt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h1>Title of the Document</h1>

test/data/doc/constructed_doc.placeholder.html.gt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h1>Title of the Document</h1>

test/data/doc/constructed_doc.referenced.html.gt

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h1>Title of the Document</h1>

test/data/doc/constructed_document.yaml.html

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h1>Title of the Document</h1>

test/data/doc/dummy_doc.yaml.html

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,17 @@
5656
math annotation {
5757
display: none;
5858
}
59+
.formula-not-decoded {
60+
background: repeating-linear-gradient(
61+
45deg, /* Angle of the stripes */
62+
LightGray, /* First color */
63+
LightGray 10px, /* Length of the first color */
64+
White 10px, /* Second color */
65+
White 20px /* Length of the second color */
66+
);
67+
margin: 0;
68+
text-align: center;
69+
}
5970
</style>
6071
</head>
6172
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>

0 commit comments

Comments
 (0)