Skip to content

Commit 3cf31cb

Browse files
fix: Define LTR/RTL text direction in HTML export (#152)
* fix: Detect RTL text and put according HTML tags Signed-off-by: Christoph Auer <[email protected]> * proposal for cau/rtl-text Signed-off-by: Peter Staar <[email protected]> * fix: Form HTML tags with utility method Signed-off-by: Christoph Auer <[email protected]> * Update tests Signed-off-by: Christoph Auer <[email protected]> * Remove commented code Signed-off-by: Christoph Auer <[email protected]> * Add back escaping Signed-off-by: Christoph Auer <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]> Signed-off-by: Peter Staar <[email protected]> Co-authored-by: Peter Staar <[email protected]>
1 parent 327f902 commit 3cf31cb

File tree

2 files changed

+64
-13
lines changed

2 files changed

+64
-13
lines changed

docling_core/types/doc/document.py

+37-13
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,11 @@
4444
from docling_core.types.doc.base import ImageRefMode
4545
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
4646
from docling_core.types.doc.tokens import DocumentToken, TableToken
47-
from docling_core.types.doc.utils import relative_path
47+
from docling_core.types.doc.utils import (
48+
get_html_tag_with_text_direction,
49+
get_text_direction,
50+
relative_path,
51+
)
4852

4953
_logger = logging.getLogger(__name__)
5054

@@ -866,7 +870,9 @@ def export_to_html(
866870

867871
caption_text = ""
868872
if len(text) > 0:
869-
caption_text = f"<figcaption>{text}</figcaption>"
873+
caption_text = get_html_tag_with_text_direction(
874+
html_tag="figcaption", text=text
875+
)
870876

871877
default_response = f"<figure>{caption_text}</figure>"
872878

@@ -1090,15 +1096,28 @@ def export_to_html(
10901096
if colspan > 1:
10911097
opening_tag += f' colspan="{colspan}"'
10921098

1099+
# Mark right-to-left cells explicitly; left-to-right is the HTML default
# and needs no attribute.
text_dir = get_text_direction(content)
if text_dir == "rtl":
    # Interpolate the detected direction. The previous code used `dir`,
    # which resolves to the Python builtin function here and would emit
    # dir="<built-in function dir>" instead of dir="rtl".
    opening_tag += f' dir="{text_dir}"'
1102+
10931103
body += f"<{opening_tag}>{content}</{celltag}>"
10941104
body += "</tr>"
10951105

1106+
# dir = get_text_direction(text)
1107+
10961108
if len(text) > 0 and len(body) > 0:
1097-
body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
1109+
caption_text = get_html_tag_with_text_direction(
1110+
html_tag="caption", text=text
1111+
)
1112+
body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
1113+
10981114
elif len(text) == 0 and len(body) > 0:
10991115
body = f"<table><tbody>{body}</tbody></table>"
11001116
elif len(text) > 0 and len(body) == 0:
1101-
body = f"<table><caption>{text}</caption></table>"
1117+
caption_text = get_html_tag_with_text_direction(
1118+
html_tag="caption", text=text
1119+
)
1120+
body = f"<table>{caption_text}</table>"
11021121
else:
11031122
body = "<table></table>"
11041123

@@ -2470,17 +2489,17 @@ def _prepare_tag_content(
24702489
continue
24712490

24722491
elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
2492+
text_inner = _prepare_tag_content(item.text)
2493+
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
24732494

2474-
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
24752495
html_texts.append(text)
24762496

24772497
elif isinstance(item, SectionHeaderItem):
24782498

24792499
section_level: int = min(item.level + 1, 6)
24802500

2481-
text = (
2482-
f"<h{(section_level)}>"
2483-
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
2501+
text = get_html_tag_with_text_direction(
2502+
html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
24842503
)
24852504
html_texts.append(text)
24862505

@@ -2544,13 +2563,15 @@ def _image_fallback(item: TextItem):
25442563
)
25452564

25462565
elif isinstance(item, ListItem):
2547-
2548-
text = f"<li>{_prepare_tag_content(item.text)}</li>"
2566+
text = get_html_tag_with_text_direction(
2567+
html_tag="li", text=_prepare_tag_content(item.text)
2568+
)
25492569
html_texts.append(text)
25502570

25512571
elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
2552-
2553-
text = f"<li>{_prepare_tag_content(item.text)}</li>"
2572+
text = get_html_tag_with_text_direction(
2573+
html_tag="li", text=_prepare_tag_content(item.text)
2574+
)
25542575
html_texts.append(text)
25552576

25562577
elif isinstance(item, CodeItem):
@@ -2562,8 +2583,11 @@ def _image_fallback(item: TextItem):
25622583

25632584
elif isinstance(item, TextItem):
25642585

2565-
text = f"<p>{_prepare_tag_content(item.text)}</p>"
2586+
text = get_html_tag_with_text_direction(
2587+
html_tag="p", text=_prepare_tag_content(item.text)
2588+
)
25662589
html_texts.append(text)
2590+
25672591
elif isinstance(item, TableItem):
25682592

25692593
text = item.export_to_html(doc=self, add_caption=True)

docling_core/types/doc/utils.py

+27
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
"""Utils for document types."""
77

8+
import unicodedata
89
from pathlib import Path
910

1011

@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
4647

4748
# Combine and return the result
4849
return Path(*up_segments, *down_segments)
50+
51+
52+
def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
    """Wrap ``text`` in an ``html_tag`` element, adding ``dir`` when needed.

    The direction reported by ``get_text_direction`` decides the markup:
    for "ltr" the element is emitted without a ``dir`` attribute; for any
    other value the opening tag carries ``dir="<direction>"``.

    Args:
        html_tag: Element name without angle brackets, e.g. "p" or "li".
        text: Inner content; it is inserted as-is, no escaping happens here.

    Returns:
        The complete HTML element as a string.
    """
    direction = get_text_direction(text)
    # Only non-default directions are spelled out in the markup.
    dir_attr = "" if direction == "ltr" else f' dir="{direction}"'
    return f"<{html_tag}{dir_attr}>{text}</{html_tag}>"
60+
61+
62+
def get_text_direction(text: str) -> str:
    """Classify a string as left-to-right ("ltr") or right-to-left ("rtl").

    A string counts as "rtl" when its first character has Unicode
    bidirectional class "R" or "AL", or when strictly more than half of
    all its characters do. Everything else, including the empty string,
    is "ltr".

    Args:
        text: The string to inspect.

    Returns:
        Either "ltr" or "rtl".
    """
    if not text:
        return "ltr"  # Default for empty input

    rtl_classes = {"R", "AL"}

    # A leading RTL character settles the question on its own.
    if unicodedata.bidirectional(text[0]) in rtl_classes:
        return "rtl"

    rtl_count = 0
    for char in text:
        if unicodedata.bidirectional(char) in rtl_classes:
            rtl_count += 1

    # Strict majority over the full length (neutral characters included);
    # integer form of ``rtl_count > len(text) / 2``.
    if 2 * rtl_count > len(text):
        return "rtl"
    return "ltr"

0 commit comments

Comments
 (0)