feat(HTML): Export formulas with mathml (#144)

dolfim-ibm · web-flow · commit ed3643734617 · 2025-01-31T15:43:43.000+01:00
* remove unneeded logic

the labels allowlist is already checked beforehand

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* TextItem cannot have label CODE

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* display formulas with mathml in exported html

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* expose formula_to_mathml argument in save_as_html

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename _sanitize_text to _prepare_tag_content and add \n replacement

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix mypy parsing

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove unused/impossible elif

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* remove strip()

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add display: none for the LaTeX annotation

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* make mathml the default

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* revert wrong commit

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -16,7 +16,10 @@
 from pathlib import Path
 from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
 from urllib.parse import quote, unquote
+from xml.etree.cElementTree import SubElement, tostring
+from xml.sax.saxutils import unescape
 
+import latex2mathml.converter
 import pandas as pd
 import yaml
 from PIL import Image as PILImage
@@ -1387,6 +1390,9 @@ class DoclingDocument(BaseModel):
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>"""
 
@@ -2282,6 +2288,7 @@ def save_as_html(
         to_element: int = sys.maxsize,
         labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+        formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
@@ -2301,6 +2308,7 @@ def save_as_html(
             to_element=to_element,
             labels=labels,
             image_mode=image_mode,
+            formula_to_mathml=formula_to_mathml,
             page_no=page_no,
             html_lang=html_lang,
             html_head=html_head,
@@ -2347,6 +2355,7 @@ def export_to_html(  # noqa: C901
         to_element: int = sys.maxsize,
         labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
         image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
+        formula_to_mathml: bool = True,
         page_no: Optional[int] = None,
         html_lang: str = "en",
         html_head: str = _HTML_DEFAULT_HEAD,
@@ -2381,9 +2390,13 @@ def close_lists(
 
         in_ordered_list: List[bool] = []  # False
 
-        def _sanitize_text(text: str, do_escape_html=True) -> str:
+        def _prepare_tag_content(
+            text: str, do_escape_html=True, do_replace_newline=True
+        ) -> str:
             if do_escape_html:
                 text = html.escape(text, quote=False)
+            if do_replace_newline:
+                text = text.replace("\n", "<br>")
             return text
 
         for ix, (item, curr_level) in enumerate(
@@ -2416,7 +2429,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
             ]:
 
                 text = "<ol>"
-                html_texts.append(text.strip())
+                html_texts.append(text)
 
                 # Increment list nesting level when entering a new list
                 in_ordered_list.append(True)
@@ -2426,7 +2439,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
             ]:
 
                 text = "<ul>"
-                html_texts.append(text.strip())
+                html_texts.append(text)
 
                 # Increment list nesting level when entering a new list
                 in_ordered_list.append(False)
@@ -2436,63 +2449,62 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
 
-                text = f"<h1>{_sanitize_text(item.text)}</h1>"
-                html_texts.append(text.strip())
+                text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
+                html_texts.append(text)
 
             elif isinstance(item, SectionHeaderItem):
 
-                section_level: int = item.level + 1
+                section_level: int = min(item.level + 1, 6)
 
                 text = (
                     f"<h{(section_level)}>"
-                    f"{_sanitize_text(item.text)}</h{(section_level)}>"
+                    f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
                 )
-                html_texts.append(text.strip())
-
-            elif isinstance(item, TextItem) and item.label in [
-                DocItemLabel.SECTION_HEADER
-            ]:
-
-                section_level = curr_level
-
-                if section_level <= 1:
-                    section_level = 2
+                html_texts.append(text)
 
-                if section_level >= 6:
-                    section_level = 6
+            elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
 
-                text = (
-                    f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
+                math_formula = _prepare_tag_content(
+                    item.text, do_escape_html=False, do_replace_newline=False
                 )
-                html_texts.append(text.strip())
-
-            elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
+                if formula_to_mathml:
+                    # Building a math equation in MathML format
+                    # ref https://www.w3.org/TR/wai-aria-1.1/#math
+                    mathml_element = latex2mathml.converter.convert_to_element(
+                        math_formula, display="block"
+                    )
+                    annotation = SubElement(
+                        mathml_element, "annotation", dict(encoding="TeX")
+                    )
+                    annotation.text = math_formula
+                    mathml = unescape(tostring(mathml_element, encoding="unicode"))
+                    text = f"<div>{mathml}</div>"
 
-                text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
+                else:
+                    text = f"<pre>{math_formula}</pre>"
                 html_texts.append(text)
 
             elif isinstance(item, ListItem):
 
-                text = f"<li>{_sanitize_text(item.text)}</li>"
+                text = f"<li>{_prepare_tag_content(item.text)}</li>"
                 html_texts.append(text)
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
 
-                text = f"<li>{_sanitize_text(item.text)}</li>"
+                text = f"<li>{_prepare_tag_content(item.text)}</li>"
                 html_texts.append(text)
 
-            elif isinstance(item, CodeItem) and item.label in labels:
-                text = (
-                    "<pre><code>"
-                    f"{_sanitize_text(item.text, do_escape_html=False)}"
-                    "</code></pre>"
+            elif isinstance(item, CodeItem):
+                code_text = _prepare_tag_content(
+                    item.text, do_escape_html=False, do_replace_newline=False
                 )
-                html_texts.append(text.strip())
+                text = f"<pre><code>{code_text}</code></pre>"
+                html_texts.append(text)
 
-            elif isinstance(item, TextItem) and item.label in labels:
+            elif isinstance(item, TextItem):
 
-                text = f"<p>{_sanitize_text(item.text)}</p>"
-                html_texts.append(text.strip())
+                text = f"<p>{_prepare_tag_content(item.text)}</p>"
+                html_texts.append(text)
             elif isinstance(item, TableItem):
 
                 text = item.export_to_html(doc=self, add_caption=True)
@@ -2513,8 +2525,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
 
         lines = []
         lines.extend(head_lines)
-        for i, line in enumerate(html_texts):
-            lines.append(line.replace("\n", "<br>"))
+        lines.extend(html_texts)
 
         delim = "\n"
         html_text = (delim.join(lines)).strip()
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
 transformers = { version = "^4.34.0", optional = true }
 semchunk = { version = "^2.2.0", optional = true }
 typer = "^0.12.5"
+latex2mathml = "^3.77.0"
 
 [tool.poetry.extras]
 chunking = ["transformers", "semchunk"]
diff --git a/test/data/doc/2206.01062.yaml.html b/test/data/doc/2206.01062.yaml.html
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>
diff --git a/test/data/doc/bad_doc.yaml.html b/test/data/doc/bad_doc.yaml.html
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h1>This is the title</h1>
diff --git a/test/data/doc/constructed_doc.embedded.html.gt b/test/data/doc/constructed_doc.embedded.html.gt
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h1>Title of the Document</h1>
diff --git a/test/data/doc/constructed_doc.placeholder.html.gt b/test/data/doc/constructed_doc.placeholder.html.gt
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h1>Title of the Document</h1>
diff --git a/test/data/doc/constructed_doc.referenced.html.gt b/test/data/doc/constructed_doc.referenced.html.gt
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h1>Title of the Document</h1>
diff --git a/test/data/doc/constructed_document.yaml.html b/test/data/doc/constructed_document.yaml.html
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h1>Title of the Document</h1>
diff --git a/test/data/doc/dummy_doc.yaml.html b/test/data/doc/dummy_doc.yaml.html
@@ -53,6 +53,9 @@
     table tr:nth-child(even) td{
     background-color: LightGray;
     }
+    math annotation {
+    display: none;
+    }
     </style>
     </head>
 <h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
diff --git a/test/data/docling_document/export/formula_mathml.html b/test/data/docling_document/export/formula_mathml.html
@@ -0,0 +1,5 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mfrac><mrow><mn>1</mn></mrow><mrow><mi>x</mi></mrow></mfrac></mrow><annotation encoding="TeX">\frac{1}{x}</annotation></math></div>
+</html>
diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py
@@ -661,6 +661,20 @@ def test_version_doc():
     assert doc.version == CURRENT_VERSION
 
 
+def test_formula_mathml():
+    doc = DoclingDocument(name="Dummy")
+    equation = "\\frac{1}{x}"
+    doc.add_text(label=DocItemLabel.FORMULA, text=equation)
+
+    doc_html = doc.export_to_html(formula_to_mathml=True, html_head="")
+
+    gt_html = Path("test/data/docling_document/export/formula_mathml.html").read_text(
+        encoding="utf8"
+    )
+
+    assert doc_html == gt_html
+
+
 def test_docitem_get_image():
     # Prepare the document
     doc = DoclingDocument(name="Dummy")