Skip to content

Commit

Permalink
refactor: upgrade BeautifulSoup4 with type hints (#999)
Browse files: browse the repository at this point in the history
* refactor: upgrade BeautifulSoup4 with type hints

Upgrade dependency library BeautifulSoup4 to 4.13.3 (with type hints).
Refactor backends using BeautifulSoup4 to comply with type hints.
Apply style simplifications and improvements for consistency.
Remove variables and functions that are never used.
Remove code duplication between backends for parsing HTML tables.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* build: allow beautifulsoup4 version 4.12.3

Allow older version of beautifulsoup4 and ensure compatibility.
Update library dependencies.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam authored Feb 18, 2025
1 parent 75db611 commit 7450050
Show file tree
Hide file tree
Showing 8 changed files with 332 additions and 429 deletions.
281 changes: 142 additions & 139 deletions docling/backend/html_backend.py

Large diffs are not rendered by default.

74 changes: 6 additions & 68 deletions docling/backend/xml/jats_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,21 @@
from pathlib import Path
from typing import Final, Optional, Union

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupItem,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from lxml import etree
from typing_extensions import TypedDict, override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

Expand Down Expand Up @@ -540,71 +539,10 @@ def _add_table(
) -> None:
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
table_tag = soup.find("table")

nested_tables = table_tag.find("table")
if nested_tables:
_log.warning(f"Skipping nested table in {str(self.file)}")
if not isinstance(table_tag, Tag):
return

# Count the number of rows (number of <tr> elements)
num_rows = len(table_tag.find_all("tr"))

# Find the number of columns (taking into account colspan)
num_cols = 0
for row in table_tag.find_all("tr"):
col_count = 0
for cell in row.find_all(["td", "th"]):
colspan = int(cell.get("colspan", 1))
col_count += colspan
num_cols = max(num_cols, col_count)

grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]

data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])

# Iterate over the rows in the table
for row_idx, row in enumerate(table_tag.find_all("tr")):
# For each row, find all the column cells (both <td> and <th>)
cells = row.find_all(["td", "th"])

# Check if each cell in the row is a header -> means it is a column header
col_header = True
for j, html_cell in enumerate(cells):
if html_cell.name == "td":
col_header = False

# Extract and print the text content of each cell
col_idx = 0
for _, html_cell in enumerate(cells):
# extract inline formulas
for formula in html_cell.find_all("inline-formula"):
math_parts = formula.text.split("$$")
if len(math_parts) == 3:
math_formula = f"$${math_parts[1]}$$"
formula.replaceWith(math_formula)
text = html_cell.text

col_span = int(html_cell.get("colspan", 1))
row_span = int(html_cell.get("rowspan", 1))

while grid[row_idx][col_idx] is not None:
col_idx += 1
for r in range(row_span):
for c in range(col_span):
grid[row_idx + r][col_idx + c] = text

cell = TableCell(
text=text,
row_span=row_span,
col_span=col_span,
start_row_offset_idx=row_idx,
end_row_offset_idx=row_idx + row_span,
start_col_offset_idx=col_idx,
end_col_offset_idx=col_idx + col_span,
col_header=col_header,
row_header=((not col_header) and html_cell.name == "th"),
)
data.table_cells.append(cell)
data = HTMLDocumentBackend.parse_table_data(table_tag)

# TODO: format label vs caption once styling is supported
label = table_xml_component["label"]
Expand All @@ -616,7 +554,8 @@ def _add_table(
else None
)

doc.add_table(data=data, parent=parent, caption=table_caption)
if data is not None:
doc.add_table(data=data, parent=parent, caption=table_caption)

return

Expand Down Expand Up @@ -673,7 +612,6 @@ def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> Non
def _walk_linear(
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
skip_tags = ["term"]
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
new_parent: NodeItem = parent
Expand Down
75 changes: 48 additions & 27 deletions docling/backend/xml/uspto_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from enum import Enum, unique
from io import BytesIO
from pathlib import Path
from typing import Any, Final, Optional, Union
from typing import Final, Optional, Union

from bs4 import BeautifulSoup, Tag
from docling_core.types.doc import (
Expand Down Expand Up @@ -1406,6 +1406,10 @@ class XmlTable:
http://oasis-open.org/specs/soextblx.dtd
"""

# Column layout collected for one <tgroup> element while parsing a table.
class ColInfo(TypedDict):
# Column count taken from the tgroup's "cols" attribute.
ncols: int
# One dict per <colspec> child, holding its "colname" and "colwidth" values.
colinfo: list[dict]

class MinColInfoType(TypedDict):
offset: list[int]
colwidth: list[int]
Expand All @@ -1425,7 +1429,7 @@ def __init__(self, input: str) -> None:
self.empty_text = ""
self._soup = BeautifulSoup(input, features="xml")

def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
"""Create a unified range along the table groups.
Args:
Expand Down Expand Up @@ -1532,19 +1536,26 @@ def _parse_table(self, table: Tag) -> TableData:
Returns:
A docling table object.
"""
tgs_align = []
tg_secs = table.find_all("tgroup")
tgs_align: list[XmlTable.ColInfo] = []
tg_secs = table("tgroup")
if tg_secs:
for tg_sec in tg_secs:
ncols = tg_sec.get("cols", None)
if ncols:
ncols = int(ncols)
tg_align = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec.find_all("colspec")
if not isinstance(tg_sec, Tag):
continue
col_val = tg_sec.get("cols")
ncols = (
int(col_val)
if isinstance(col_val, str) and col_val.isnumeric()
else 1
)
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
cs_secs = tg_sec("colspec")
if cs_secs:
for cs_sec in cs_secs:
colname = cs_sec.get("colname", None)
colwidth = cs_sec.get("colwidth", None)
if not isinstance(cs_sec, Tag):
continue
colname = cs_sec.get("colname")
colwidth = cs_sec.get("colwidth")
tg_align["colinfo"].append(
{"colname": colname, "colwidth": colwidth}
)
Expand All @@ -1565,40 +1576,50 @@ def _parse_table(self, table: Tag) -> TableData:
table_data: list[TableCell] = []
i_row_global = 0
is_row_empty: bool = True
tg_secs = table.find_all("tgroup")
tg_secs = table("tgroup")
if tg_secs:
for itg, tg_sec in enumerate(tg_secs):
if not isinstance(tg_sec, Tag):
continue
tg_range = tgs_range[itg]
row_secs = tg_sec.find_all(["row", "tr"])
row_secs = tg_sec(["row", "tr"])

if row_secs:
for row_sec in row_secs:
entry_secs = row_sec.find_all(["entry", "td"])
is_header: bool = row_sec.parent.name in ["thead"]
if not isinstance(row_sec, Tag):
continue
entry_secs = row_sec(["entry", "td"])
is_header: bool = (
row_sec.parent is not None
and row_sec.parent.name == "thead"
)

ncols = 0
local_row: list[TableCell] = []
is_row_empty = True
if entry_secs:
wrong_nbr_cols = False
for ientry, entry_sec in enumerate(entry_secs):
if not isinstance(entry_sec, Tag):
continue
text = entry_sec.get_text().strip()

# start-end
namest = entry_sec.attrs.get("namest", None)
nameend = entry_sec.attrs.get("nameend", None)
if isinstance(namest, str) and namest.isnumeric():
namest = int(namest)
else:
namest = ientry + 1
namest = entry_sec.get("namest")
nameend = entry_sec.get("nameend")
start = (
int(namest)
if isinstance(namest, str) and namest.isnumeric()
else ientry + 1
)
if isinstance(nameend, str) and nameend.isnumeric():
nameend = int(nameend)
end = int(nameend)
shift = 0
else:
nameend = ientry + 2
end = ientry + 2
shift = 1

if nameend > len(tg_range["cell_offst"]):
if end > len(tg_range["cell_offst"]):
wrong_nbr_cols = True
self.nbr_messages += 1
if self.nbr_messages <= self.max_nbr_messages:
Expand All @@ -1608,8 +1629,8 @@ def _parse_table(self, table: Tag) -> TableData:
break

range_ = [
tg_range["cell_offst"][namest - 1],
tg_range["cell_offst"][nameend - 1] - shift,
tg_range["cell_offst"][start - 1],
tg_range["cell_offst"][end - 1] - shift,
]

# add row and replicate cell if needed
Expand Down Expand Up @@ -1668,7 +1689,7 @@ def parse(self) -> Optional[TableData]:
A docling table data.
"""
section = self._soup.find("table")
if section is not None:
if isinstance(section, Tag):
table = self._parse_table(section)
if table.num_rows == 0 or table.num_cols == 0:
_log.warning("The parsed USPTO table is empty")
Expand Down
Loading

0 comments on commit 7450050

Please sign in to comment.