Skip to content

Commit 7450050

Browse files
authored
refactor: upgrade BeautifulSoup4 with type hints (#999)
* refactor: upgrade BeautifulSoup4 with type hints

  Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints).
  Refactor backends using BeautifulSoup4 to comply with type hints.
  Apply style simplifications and improvements for consistency.
  Remove variables and functions that are never used.
  Remove code duplication between backends for parsing HTML tables.

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* build: allow beautifulsoup4 version 4.12.3

  Allow older version of beautifulsoup4 and ensure compatibility.
  Update library dependencies.

  Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
1 parent 75db611 commit 7450050

File tree

8 files changed

+332
-429
lines changed

8 files changed

+332
-429
lines changed

docling/backend/html_backend.py

Lines changed: 142 additions & 139 deletions
Large diffs are not rendered by default.

docling/backend/xml/jats_backend.py

Lines changed: 6 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,21 @@
44
from pathlib import Path
55
from typing import Final, Optional, Union
66

7-
from bs4 import BeautifulSoup
7+
from bs4 import BeautifulSoup, Tag
88
from docling_core.types.doc import (
99
DocItemLabel,
1010
DoclingDocument,
1111
DocumentOrigin,
1212
GroupItem,
1313
GroupLabel,
1414
NodeItem,
15-
TableCell,
16-
TableData,
1715
TextItem,
1816
)
1917
from lxml import etree
2018
from typing_extensions import TypedDict, override
2119

2220
from docling.backend.abstract_backend import DeclarativeDocumentBackend
21+
from docling.backend.html_backend import HTMLDocumentBackend
2322
from docling.datamodel.base_models import InputFormat
2423
from docling.datamodel.document import InputDocument
2524

@@ -540,71 +539,10 @@ def _add_table(
540539
) -> None:
541540
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
542541
table_tag = soup.find("table")
543-
544-
nested_tables = table_tag.find("table")
545-
if nested_tables:
546-
_log.warning(f"Skipping nested table in {str(self.file)}")
542+
if not isinstance(table_tag, Tag):
547543
return
548544

549-
# Count the number of rows (number of <tr> elements)
550-
num_rows = len(table_tag.find_all("tr"))
551-
552-
# Find the number of columns (taking into account colspan)
553-
num_cols = 0
554-
for row in table_tag.find_all("tr"):
555-
col_count = 0
556-
for cell in row.find_all(["td", "th"]):
557-
colspan = int(cell.get("colspan", 1))
558-
col_count += colspan
559-
num_cols = max(num_cols, col_count)
560-
561-
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
562-
563-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
564-
565-
# Iterate over the rows in the table
566-
for row_idx, row in enumerate(table_tag.find_all("tr")):
567-
# For each row, find all the column cells (both <td> and <th>)
568-
cells = row.find_all(["td", "th"])
569-
570-
# Check if each cell in the row is a header -> means it is a column header
571-
col_header = True
572-
for j, html_cell in enumerate(cells):
573-
if html_cell.name == "td":
574-
col_header = False
575-
576-
# Extract and print the text content of each cell
577-
col_idx = 0
578-
for _, html_cell in enumerate(cells):
579-
# extract inline formulas
580-
for formula in html_cell.find_all("inline-formula"):
581-
math_parts = formula.text.split("$$")
582-
if len(math_parts) == 3:
583-
math_formula = f"$${math_parts[1]}$$"
584-
formula.replaceWith(math_formula)
585-
text = html_cell.text
586-
587-
col_span = int(html_cell.get("colspan", 1))
588-
row_span = int(html_cell.get("rowspan", 1))
589-
590-
while grid[row_idx][col_idx] is not None:
591-
col_idx += 1
592-
for r in range(row_span):
593-
for c in range(col_span):
594-
grid[row_idx + r][col_idx + c] = text
595-
596-
cell = TableCell(
597-
text=text,
598-
row_span=row_span,
599-
col_span=col_span,
600-
start_row_offset_idx=row_idx,
601-
end_row_offset_idx=row_idx + row_span,
602-
start_col_offset_idx=col_idx,
603-
end_col_offset_idx=col_idx + col_span,
604-
col_header=col_header,
605-
row_header=((not col_header) and html_cell.name == "th"),
606-
)
607-
data.table_cells.append(cell)
545+
data = HTMLDocumentBackend.parse_table_data(table_tag)
608546

609547
# TODO: format label vs caption once styling is supported
610548
label = table_xml_component["label"]
@@ -616,7 +554,8 @@ def _add_table(
616554
else None
617555
)
618556

619-
doc.add_table(data=data, parent=parent, caption=table_caption)
557+
if data is not None:
558+
doc.add_table(data=data, parent=parent, caption=table_caption)
620559

621560
return
622561

@@ -673,7 +612,6 @@ def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> Non
673612
def _walk_linear(
674613
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
675614
) -> str:
676-
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
677615
skip_tags = ["term"]
678616
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
679617
new_parent: NodeItem = parent

docling/backend/xml/uspto_backend.py

Lines changed: 48 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from enum import Enum, unique
1515
from io import BytesIO
1616
from pathlib import Path
17-
from typing import Any, Final, Optional, Union
17+
from typing import Final, Optional, Union
1818

1919
from bs4 import BeautifulSoup, Tag
2020
from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
14061406
http://oasis-open.org/specs/soextblx.dtd
14071407
"""
14081408

1409+
class ColInfo(TypedDict):
1410+
ncols: int
1411+
colinfo: list[dict]
1412+
14091413
class MinColInfoType(TypedDict):
14101414
offset: list[int]
14111415
colwidth: list[int]
@@ -1425,7 +1429,7 @@ def __init__(self, input: str) -> None:
14251429
self.empty_text = ""
14261430
self._soup = BeautifulSoup(input, features="xml")
14271431

1428-
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
1432+
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
14291433
"""Create a unified range along the table groups.
14301434
14311435
Args:
@@ -1532,19 +1536,26 @@ def _parse_table(self, table: Tag) -> TableData:
15321536
Returns:
15331537
A docling table object.
15341538
"""
1535-
tgs_align = []
1536-
tg_secs = table.find_all("tgroup")
1539+
tgs_align: list[XmlTable.ColInfo] = []
1540+
tg_secs = table("tgroup")
15371541
if tg_secs:
15381542
for tg_sec in tg_secs:
1539-
ncols = tg_sec.get("cols", None)
1540-
if ncols:
1541-
ncols = int(ncols)
1542-
tg_align = {"ncols": ncols, "colinfo": []}
1543-
cs_secs = tg_sec.find_all("colspec")
1543+
if not isinstance(tg_sec, Tag):
1544+
continue
1545+
col_val = tg_sec.get("cols")
1546+
ncols = (
1547+
int(col_val)
1548+
if isinstance(col_val, str) and col_val.isnumeric()
1549+
else 1
1550+
)
1551+
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
1552+
cs_secs = tg_sec("colspec")
15441553
if cs_secs:
15451554
for cs_sec in cs_secs:
1546-
colname = cs_sec.get("colname", None)
1547-
colwidth = cs_sec.get("colwidth", None)
1555+
if not isinstance(cs_sec, Tag):
1556+
continue
1557+
colname = cs_sec.get("colname")
1558+
colwidth = cs_sec.get("colwidth")
15481559
tg_align["colinfo"].append(
15491560
{"colname": colname, "colwidth": colwidth}
15501561
)
@@ -1565,40 +1576,50 @@ def _parse_table(self, table: Tag) -> TableData:
15651576
table_data: list[TableCell] = []
15661577
i_row_global = 0
15671578
is_row_empty: bool = True
1568-
tg_secs = table.find_all("tgroup")
1579+
tg_secs = table("tgroup")
15691580
if tg_secs:
15701581
for itg, tg_sec in enumerate(tg_secs):
1582+
if not isinstance(tg_sec, Tag):
1583+
continue
15711584
tg_range = tgs_range[itg]
1572-
row_secs = tg_sec.find_all(["row", "tr"])
1585+
row_secs = tg_sec(["row", "tr"])
15731586

15741587
if row_secs:
15751588
for row_sec in row_secs:
1576-
entry_secs = row_sec.find_all(["entry", "td"])
1577-
is_header: bool = row_sec.parent.name in ["thead"]
1589+
if not isinstance(row_sec, Tag):
1590+
continue
1591+
entry_secs = row_sec(["entry", "td"])
1592+
is_header: bool = (
1593+
row_sec.parent is not None
1594+
and row_sec.parent.name == "thead"
1595+
)
15781596

15791597
ncols = 0
15801598
local_row: list[TableCell] = []
15811599
is_row_empty = True
15821600
if entry_secs:
15831601
wrong_nbr_cols = False
15841602
for ientry, entry_sec in enumerate(entry_secs):
1603+
if not isinstance(entry_sec, Tag):
1604+
continue
15851605
text = entry_sec.get_text().strip()
15861606

15871607
# start-end
1588-
namest = entry_sec.attrs.get("namest", None)
1589-
nameend = entry_sec.attrs.get("nameend", None)
1590-
if isinstance(namest, str) and namest.isnumeric():
1591-
namest = int(namest)
1592-
else:
1593-
namest = ientry + 1
1608+
namest = entry_sec.get("namest")
1609+
nameend = entry_sec.get("nameend")
1610+
start = (
1611+
int(namest)
1612+
if isinstance(namest, str) and namest.isnumeric()
1613+
else ientry + 1
1614+
)
15941615
if isinstance(nameend, str) and nameend.isnumeric():
1595-
nameend = int(nameend)
1616+
end = int(nameend)
15961617
shift = 0
15971618
else:
1598-
nameend = ientry + 2
1619+
end = ientry + 2
15991620
shift = 1
16001621

1601-
if nameend > len(tg_range["cell_offst"]):
1622+
if end > len(tg_range["cell_offst"]):
16021623
wrong_nbr_cols = True
16031624
self.nbr_messages += 1
16041625
if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ def _parse_table(self, table: Tag) -> TableData:
16081629
break
16091630

16101631
range_ = [
1611-
tg_range["cell_offst"][namest - 1],
1612-
tg_range["cell_offst"][nameend - 1] - shift,
1632+
tg_range["cell_offst"][start - 1],
1633+
tg_range["cell_offst"][end - 1] - shift,
16131634
]
16141635

16151636
# add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ def parse(self) -> Optional[TableData]:
16681689
A docling table data.
16691690
"""
16701691
section = self._soup.find("table")
1671-
if section is not None:
1692+
if isinstance(section, Tag):
16721693
table = self._parse_table(section)
16731694
if table.num_rows == 0 or table.num_cols == 0:
16741695
_log.warning("The parsed USPTO table is empty")

0 commit comments

Comments
 (0)