fix: Define LTR/RTL text direction in HTML export (#152)

cau-git · PeterStaar-IBM · web-flow · commit 3cf31cbe384e · 2025-02-05T13:37:39.000+01:00
* fix: Detect RTL text and add the corresponding HTML tags

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* proposal for cau/rtl-text

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix: Form HTML tags with utility method

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove commented code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add back escaping

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Co-authored-by: Peter Staar <taa@zurich.ibm.com>
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -44,7 +44,11 @@
 from docling_core.types.doc.base import ImageRefMode
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
 from docling_core.types.doc.tokens import DocumentToken, TableToken
-from docling_core.types.doc.utils import relative_path
+from docling_core.types.doc.utils import (
+    get_html_tag_with_text_direction,
+    get_text_direction,
+    relative_path,
+)
 
 _logger = logging.getLogger(__name__)
 
@@ -866,7 +870,9 @@ def export_to_html(
 
         caption_text = ""
         if len(text) > 0:
-            caption_text = f"<figcaption>{text}</figcaption>"
+            caption_text = get_html_tag_with_text_direction(
+                html_tag="figcaption", text=text
+            )
 
         default_response = f"<figure>{caption_text}</figure>"
 
@@ -1090,15 +1096,28 @@ def export_to_html(
                 if colspan > 1:
                     opening_tag += f' colspan="{colspan}"'
 
+                text_dir = get_text_direction(content)
+                if text_dir == "rtl":  # LTR cells get no dir attribute
+                    opening_tag += f' dir="{text_dir}"'
+
                 body += f"<{opening_tag}>{content}</{celltag}>"
             body += "</tr>"
 
+        # dir = get_text_direction(text)
+
         if len(text) > 0 and len(body) > 0:
-            body = f"<table><caption>{text}</caption><tbody>{body}</tbody></table>"
+            caption_text = get_html_tag_with_text_direction(
+                html_tag="caption", text=text
+            )
+            body = f"<table>{caption_text}<tbody>{body}</tbody></table>"
+
         elif len(text) == 0 and len(body) > 0:
             body = f"<table><tbody>{body}</tbody></table>"
         elif len(text) > 0 and len(body) == 0:
-            body = f"<table><caption>{text}</caption></table>"
+            caption_text = get_html_tag_with_text_direction(
+                html_tag="caption", text=text
+            )
+            body = f"<table>{caption_text}</table>"
         else:
             body = "<table></table>"
 
@@ -2470,17 +2489,17 @@ def _prepare_tag_content(
                 continue
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
+                text_inner = _prepare_tag_content(item.text)
+                text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
 
-                text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
                 html_texts.append(text)
 
             elif isinstance(item, SectionHeaderItem):
 
                 section_level: int = min(item.level + 1, 6)
 
-                text = (
-                    f"<h{(section_level)}>"
-                    f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
+                text = get_html_tag_with_text_direction(
+                    html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
                 )
                 html_texts.append(text)
 
@@ -2544,13 +2563,15 @@ def _image_fallback(item: TextItem):
                     )
 
             elif isinstance(item, ListItem):
-
-                text = f"<li>{_prepare_tag_content(item.text)}</li>"
+                text = get_html_tag_with_text_direction(
+                    html_tag="li", text=_prepare_tag_content(item.text)
+                )
                 html_texts.append(text)
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
-
-                text = f"<li>{_prepare_tag_content(item.text)}</li>"
+                text = get_html_tag_with_text_direction(
+                    html_tag="li", text=_prepare_tag_content(item.text)
+                )
                 html_texts.append(text)
 
             elif isinstance(item, CodeItem):
@@ -2562,8 +2583,11 @@ def _image_fallback(item: TextItem):
 
             elif isinstance(item, TextItem):
 
-                text = f"<p>{_prepare_tag_content(item.text)}</p>"
+                text = get_html_tag_with_text_direction(
+                    html_tag="p", text=_prepare_tag_content(item.text)
+                )
                 html_texts.append(text)
+
             elif isinstance(item, TableItem):
 
                 text = item.export_to_html(doc=self, add_caption=True)
diff --git a/docling_core/types/doc/utils.py b/docling_core/types/doc/utils.py
@@ -5,6 +5,7 @@
 
 """Utils for document types."""
 
+import unicodedata
 from pathlib import Path
 
 
@@ -46,3 +47,29 @@ def relative_path(src: Path, target: Path) -> Path:
 
     # Combine and return the result
     return Path(*up_segments, *down_segments)
+
+
+def get_html_tag_with_text_direction(html_tag: str, text: str) -> str:
+    """Wrap text in an HTML element, adding a dir attribute unless it is LTR."""
+    direction = get_text_direction(text)
+    # Only non-LTR text gets an explicit dir attribute on the opening tag.
+    dir_attr = "" if direction == "ltr" else f' dir="{direction}"'
+
+    # The text is inserted verbatim; this helper performs no escaping itself.
+    return f"<{html_tag}{dir_attr}>{text}</{html_tag}>"
+
+
+def get_text_direction(text: str) -> str:
+    """Classify a string as "rtl" or "ltr" from its Unicode bidi classes."""
+    if not text:
+        return "ltr"  # Nothing to inspect, fall back to LTR
+
+    # Bidi classes "R" and "AL" are treated as right-to-left characters.
+    is_rtl = [unicodedata.bidirectional(ch) in ("R", "AL") for ch in text]
+
+    # RTL if the first character is RTL or more than half of all characters are.
+    if is_rtl[0]:
+        return "rtl"
+    if 2 * sum(is_rtl) > len(text):
+        return "rtl"
+    return "ltr"