refactor: upgrade BeautifulSoup4 with type hints (#999)

* refactor: upgrade BeautifulSoup4 with type hints

  Upgrade the dependency library BeautifulSoup4 to 4.13.3 (with type hints).
  Refactor the backends that use BeautifulSoup4 to comply with the type hints.
  Apply style simplifications and improvements for consistency.
  Remove variables and functions that are never used.
  Remove code duplication between backends for parsing HTML tables.

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* build: allow beautifulsoup4 version 4.12.3

  Allow an older version of beautifulsoup4 and ensure compatibility.
  Update library dependencies.

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
DS4SD · Feb 18, 2025 · 7450050 · 7450050
1 parent 75db611
commit 7450050
Show file tree

Hide file tree

Showing 8 changed files with 332 additions and 429 deletions.
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py
@@ -4,22 +4,21 @@
 from pathlib import Path
 from typing import Final, Optional, Union
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupItem,
     GroupLabel,
     NodeItem,
-    TableCell,
-    TableData,
     TextItem,
 )
 from lxml import etree
 from typing_extensions import TypedDict, override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 
@@ -540,71 +539,10 @@ def _add_table(
     ) -> None:
         soup = BeautifulSoup(table_xml_component["content"], "html.parser")
         table_tag = soup.find("table")
-
-        nested_tables = table_tag.find("table")
-        if nested_tables:
-            _log.warning(f"Skipping nested table in {str(self.file)}")
+        if not isinstance(table_tag, Tag):
             return
 
-        # Count the number of rows (number of <tr> elements)
-        num_rows = len(table_tag.find_all("tr"))
-
-        # Find the number of columns (taking into account colspan)
-        num_cols = 0
-        for row in table_tag.find_all("tr"):
-            col_count = 0
-            for cell in row.find_all(["td", "th"]):
-                colspan = int(cell.get("colspan", 1))
-                col_count += colspan
-            num_cols = max(num_cols, col_count)
-
-        grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
-
-        data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
-
-        # Iterate over the rows in the table
-        for row_idx, row in enumerate(table_tag.find_all("tr")):
-            # For each row, find all the column cells (both <td> and <th>)
-            cells = row.find_all(["td", "th"])
-
-            # Check if each cell in the row is a header -> means it is a column header
-            col_header = True
-            for j, html_cell in enumerate(cells):
-                if html_cell.name == "td":
-                    col_header = False
-
-            # Extract and print the text content of each cell
-            col_idx = 0
-            for _, html_cell in enumerate(cells):
-                # extract inline formulas
-                for formula in html_cell.find_all("inline-formula"):
-                    math_parts = formula.text.split("$$")
-                    if len(math_parts) == 3:
-                        math_formula = f"$${math_parts[1]}$$"
-                        formula.replaceWith(math_formula)
-                text = html_cell.text
-
-                col_span = int(html_cell.get("colspan", 1))
-                row_span = int(html_cell.get("rowspan", 1))
-
-                while grid[row_idx][col_idx] is not None:
-                    col_idx += 1
-                for r in range(row_span):
-                    for c in range(col_span):
-                        grid[row_idx + r][col_idx + c] = text
-
-                cell = TableCell(
-                    text=text,
-                    row_span=row_span,
-                    col_span=col_span,
-                    start_row_offset_idx=row_idx,
-                    end_row_offset_idx=row_idx + row_span,
-                    start_col_offset_idx=col_idx,
-                    end_col_offset_idx=col_idx + col_span,
-                    col_header=col_header,
-                    row_header=((not col_header) and html_cell.name == "th"),
-                )
-                data.table_cells.append(cell)
+        data = HTMLDocumentBackend.parse_table_data(table_tag)
 
         # TODO: format label vs caption once styling is supported
         label = table_xml_component["label"]
@@ -616,7 +554,8 @@ def _add_table(
             else None
         )
 
-        doc.add_table(data=data, parent=parent, caption=table_caption)
+        if data is not None:
+            doc.add_table(data=data, parent=parent, caption=table_caption)
 
         return
 
@@ -673,7 +612,6 @@ def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> Non
     def _walk_linear(
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
-        # _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
         skip_tags = ["term"]
         flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
         new_parent: NodeItem = parent

diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py
@@ -14,7 +14,7 @@
 from enum import Enum, unique
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Final, Optional, Union
+from typing import Final, Optional, Union
 
 from bs4 import BeautifulSoup, Tag
 from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
     http://oasis-open.org/specs/soextblx.dtd
     """
 
+    class ColInfo(TypedDict):
+        ncols: int
+        colinfo: list[dict]
+
     class MinColInfoType(TypedDict):
         offset: list[int]
         colwidth: list[int]
@@ -1425,7 +1429,7 @@ def __init__(self, input: str) -> None:
         self.empty_text = ""
         self._soup = BeautifulSoup(input, features="xml")
 
-    def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
+    def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
         """Create a unified range along the table groups.
 
         Args:
@@ -1532,19 +1536,26 @@ def _parse_table(self, table: Tag) -> TableData:
         Returns:
             A docling table object.
         """
-        tgs_align = []
-        tg_secs = table.find_all("tgroup")
+        tgs_align: list[XmlTable.ColInfo] = []
+        tg_secs = table("tgroup")
         if tg_secs:
             for tg_sec in tg_secs:
-                ncols = tg_sec.get("cols", None)
-                if ncols:
-                    ncols = int(ncols)
-                tg_align = {"ncols": ncols, "colinfo": []}
-                cs_secs = tg_sec.find_all("colspec")
+                if not isinstance(tg_sec, Tag):
+                    continue
+                col_val = tg_sec.get("cols")
+                ncols = (
+                    int(col_val)
+                    if isinstance(col_val, str) and col_val.isnumeric()
+                    else 1
+                )
+                tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
+                cs_secs = tg_sec("colspec")
                 if cs_secs:
                     for cs_sec in cs_secs:
-                        colname = cs_sec.get("colname", None)
-                        colwidth = cs_sec.get("colwidth", None)
+                        if not isinstance(cs_sec, Tag):
+                            continue
+                        colname = cs_sec.get("colname")
+                        colwidth = cs_sec.get("colwidth")
                         tg_align["colinfo"].append(
                             {"colname": colname, "colwidth": colwidth}
                         )
@@ -1565,40 +1576,50 @@ def _parse_table(self, table: Tag) -> TableData:
         table_data: list[TableCell] = []
         i_row_global = 0
         is_row_empty: bool = True
-        tg_secs = table.find_all("tgroup")
+        tg_secs = table("tgroup")
         if tg_secs:
             for itg, tg_sec in enumerate(tg_secs):
+                if not isinstance(tg_sec, Tag):
+                    continue
                 tg_range = tgs_range[itg]
-                row_secs = tg_sec.find_all(["row", "tr"])
+                row_secs = tg_sec(["row", "tr"])
 
                 if row_secs:
                     for row_sec in row_secs:
-                        entry_secs = row_sec.find_all(["entry", "td"])
-                        is_header: bool = row_sec.parent.name in ["thead"]
+                        if not isinstance(row_sec, Tag):
+                            continue
+                        entry_secs = row_sec(["entry", "td"])
+                        is_header: bool = (
+                            row_sec.parent is not None
+                            and row_sec.parent.name == "thead"
+                        )
 
                         ncols = 0
                         local_row: list[TableCell] = []
                         is_row_empty = True
                         if entry_secs:
                             wrong_nbr_cols = False
                             for ientry, entry_sec in enumerate(entry_secs):
+                                if not isinstance(entry_sec, Tag):
+                                    continue
                                 text = entry_sec.get_text().strip()
 
                                 # start-end
-                                namest = entry_sec.attrs.get("namest", None)
-                                nameend = entry_sec.attrs.get("nameend", None)
-                                if isinstance(namest, str) and namest.isnumeric():
-                                    namest = int(namest)
-                                else:
-                                    namest = ientry + 1
+                                namest = entry_sec.get("namest")
+                                nameend = entry_sec.get("nameend")
+                                start = (
+                                    int(namest)
+                                    if isinstance(namest, str) and namest.isnumeric()
+                                    else ientry + 1
+                                )
                                 if isinstance(nameend, str) and nameend.isnumeric():
-                                    nameend = int(nameend)
+                                    end = int(nameend)
                                     shift = 0
                                 else:
-                                    nameend = ientry + 2
+                                    end = ientry + 2
                                     shift = 1
 
-                                if nameend > len(tg_range["cell_offst"]):
+                                if end > len(tg_range["cell_offst"]):
                                     wrong_nbr_cols = True
                                     self.nbr_messages += 1
                                     if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ def _parse_table(self, table: Tag) -> TableData:
                                     break
 
                                 range_ = [
-                                    tg_range["cell_offst"][namest - 1],
-                                    tg_range["cell_offst"][nameend - 1] - shift,
+                                    tg_range["cell_offst"][start - 1],
+                                    tg_range["cell_offst"][end - 1] - shift,
                                 ]
 
                                 # add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ def parse(self) -> Optional[TableData]:
             A docling table data.
         """
         section = self._soup.find("table")
-        if section is not None:
+        if isinstance(section, Tag):
             table = self._parse_table(section)
             if table.num_rows == 0 or table.num_cols == 0:
                 _log.warning("The parsed USPTO table is empty")