16
16
from pathlib import Path
17
17
from typing import Any , Dict , Final , List , Literal , Optional , Tuple , Union
18
18
from urllib .parse import quote , unquote
19
+ from xml .etree .cElementTree import SubElement , tostring
20
+ from xml .sax .saxutils import unescape
19
21
22
+ import latex2mathml .converter
20
23
import pandas as pd
21
24
import yaml
22
25
from PIL import Image as PILImage
@@ -1387,6 +1390,9 @@ class DoclingDocument(BaseModel):
1387
1390
table tr:nth-child(even) td{
1388
1391
background-color: LightGray;
1389
1392
}
1393
+ math annotation {
1394
+ display: none;
1395
+ }
1390
1396
</style>
1391
1397
</head>"""
1392
1398
@@ -2282,6 +2288,7 @@ def save_as_html(
2282
2288
to_element : int = sys .maxsize ,
2283
2289
labels : set [DocItemLabel ] = DEFAULT_EXPORT_LABELS ,
2284
2290
image_mode : ImageRefMode = ImageRefMode .PLACEHOLDER ,
2291
+ formula_to_mathml : bool = True ,
2285
2292
page_no : Optional [int ] = None ,
2286
2293
html_lang : str = "en" ,
2287
2294
html_head : str = _HTML_DEFAULT_HEAD ,
@@ -2301,6 +2308,7 @@ def save_as_html(
2301
2308
to_element = to_element ,
2302
2309
labels = labels ,
2303
2310
image_mode = image_mode ,
2311
+ formula_to_mathml = formula_to_mathml ,
2304
2312
page_no = page_no ,
2305
2313
html_lang = html_lang ,
2306
2314
html_head = html_head ,
@@ -2347,6 +2355,7 @@ def export_to_html( # noqa: C901
2347
2355
to_element : int = sys .maxsize ,
2348
2356
labels : set [DocItemLabel ] = DEFAULT_EXPORT_LABELS ,
2349
2357
image_mode : ImageRefMode = ImageRefMode .PLACEHOLDER ,
2358
+ formula_to_mathml : bool = True ,
2350
2359
page_no : Optional [int ] = None ,
2351
2360
html_lang : str = "en" ,
2352
2361
html_head : str = _HTML_DEFAULT_HEAD ,
@@ -2381,9 +2390,13 @@ def close_lists(
2381
2390
2382
2391
in_ordered_list : List [bool ] = [] # False
2383
2392
2384
- def _sanitize_text (text : str , do_escape_html = True ) -> str :
2393
+ def _prepare_tag_content (
2394
+ text : str , do_escape_html = True , do_replace_newline = True
2395
+ ) -> str :
2385
2396
if do_escape_html :
2386
2397
text = html .escape (text , quote = False )
2398
+ if do_replace_newline :
2399
+ text = text .replace ("\n " , "<br>" )
2387
2400
return text
2388
2401
2389
2402
for ix , (item , curr_level ) in enumerate (
@@ -2416,7 +2429,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
2416
2429
]:
2417
2430
2418
2431
text = "<ol>"
2419
- html_texts .append (text . strip () )
2432
+ html_texts .append (text )
2420
2433
2421
2434
# Increment list nesting level when entering a new list
2422
2435
in_ordered_list .append (True )
@@ -2426,7 +2439,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
2426
2439
]:
2427
2440
2428
2441
text = "<ul>"
2429
- html_texts .append (text . strip () )
2442
+ html_texts .append (text )
2430
2443
2431
2444
# Increment list nesting level when entering a new list
2432
2445
in_ordered_list .append (False )
@@ -2436,63 +2449,62 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
2436
2449
2437
2450
elif isinstance (item , TextItem ) and item .label in [DocItemLabel .TITLE ]:
2438
2451
2439
- text = f"<h1>{ _sanitize_text (item .text )} </h1>"
2440
- html_texts .append (text . strip () )
2452
+ text = f"<h1>{ _prepare_tag_content (item .text )} </h1>"
2453
+ html_texts .append (text )
2441
2454
2442
2455
elif isinstance (item , SectionHeaderItem ):
2443
2456
2444
- section_level : int = item .level + 1
2457
+ section_level : int = min ( item .level + 1 , 6 )
2445
2458
2446
2459
text = (
2447
2460
f"<h{ (section_level )} >"
2448
- f"{ _sanitize_text (item .text )} </h{ (section_level )} >"
2461
+ f"{ _prepare_tag_content (item .text )} </h{ (section_level )} >"
2449
2462
)
2450
- html_texts .append (text .strip ())
2451
-
2452
- elif isinstance (item , TextItem ) and item .label in [
2453
- DocItemLabel .SECTION_HEADER
2454
- ]:
2455
-
2456
- section_level = curr_level
2457
-
2458
- if section_level <= 1 :
2459
- section_level = 2
2463
+ html_texts .append (text )
2460
2464
2461
- if section_level >= 6 :
2462
- section_level = 6
2465
+ elif isinstance (item , TextItem ) and item .label in [DocItemLabel .FORMULA ]:
2463
2466
2464
- text = (
2465
- f"<h { section_level } > { _sanitize_text ( item .text ) } </h { section_level } >"
2467
+ math_formula = _prepare_tag_content (
2468
+ item .text , do_escape_html = False , do_replace_newline = False
2466
2469
)
2467
- html_texts .append (text .strip ())
2468
-
2469
- elif isinstance (item , TextItem ) and item .label in [DocItemLabel .CODE ]:
2470
+ if formula_to_mathml :
2471
+ # Building a math equation in MathML format
2472
+ # ref https://www.w3.org/TR/wai-aria-1.1/#math
2473
+ mathml_element = latex2mathml .converter .convert_to_element (
2474
+ math_formula , display = "block"
2475
+ )
2476
+ annotation = SubElement (
2477
+ mathml_element , "annotation" , dict (encoding = "TeX" )
2478
+ )
2479
+ annotation .text = math_formula
2480
+ mathml = unescape (tostring (mathml_element , encoding = "unicode" ))
2481
+ text = f"<div>{ mathml } </div>"
2470
2482
2471
- text = f"<pre>{ _sanitize_text (item .text , do_escape_html = False )} </pre>"
2483
+ else :
2484
+ text = f"<pre>{ math_formula } </pre>"
2472
2485
html_texts .append (text )
2473
2486
2474
2487
elif isinstance (item , ListItem ):
2475
2488
2476
- text = f"<li>{ _sanitize_text (item .text )} </li>"
2489
+ text = f"<li>{ _prepare_tag_content (item .text )} </li>"
2477
2490
html_texts .append (text )
2478
2491
2479
2492
elif isinstance (item , TextItem ) and item .label in [DocItemLabel .LIST_ITEM ]:
2480
2493
2481
- text = f"<li>{ _sanitize_text (item .text )} </li>"
2494
+ text = f"<li>{ _prepare_tag_content (item .text )} </li>"
2482
2495
html_texts .append (text )
2483
2496
2484
- elif isinstance (item , CodeItem ) and item .label in labels :
2485
- text = (
2486
- "<pre><code>"
2487
- f"{ _sanitize_text (item .text , do_escape_html = False )} "
2488
- "</code></pre>"
2497
+ elif isinstance (item , CodeItem ):
2498
+ code_text = _prepare_tag_content (
2499
+ item .text , do_escape_html = False , do_replace_newline = False
2489
2500
)
2490
- html_texts .append (text .strip ())
2501
+ text = f"<pre><code>{ code_text } </code></pre>"
2502
+ html_texts .append (text )
2491
2503
2492
- elif isinstance (item , TextItem ) and item . label in labels :
2504
+ elif isinstance (item , TextItem ):
2493
2505
2494
- text = f"<p>{ _sanitize_text (item .text )} </p>"
2495
- html_texts .append (text . strip () )
2506
+ text = f"<p>{ _prepare_tag_content (item .text )} </p>"
2507
+ html_texts .append (text )
2496
2508
elif isinstance (item , TableItem ):
2497
2509
2498
2510
text = item .export_to_html (doc = self , add_caption = True )
@@ -2513,8 +2525,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
2513
2525
2514
2526
lines = []
2515
2527
lines .extend (head_lines )
2516
- for i , line in enumerate (html_texts ):
2517
- lines .append (line .replace ("\n " , "<br>" ))
2528
+ lines .extend (html_texts )
2518
2529
2519
2530
delim = "\n "
2520
2531
html_text = (delim .join (lines )).strip ()
0 commit comments