Skip to content

Commit d788bf2

Browse files
committed
Merge from main
Signed-off-by: Christoph Auer <[email protected]>
2 parents 8606b59 + 7450050 commit d788bf2

File tree

8 files changed

+331
-428
lines changed

8 files changed

+331
-428
lines changed

docling/backend/html_backend.py

Lines changed: 142 additions & 139 deletions
Large diffs are not rendered by default.

docling/backend/xml/jats_backend.py

Lines changed: 6 additions & 68 deletions
Original file line number | Diff line number | Diff line change
@@ -4,22 +4,21 @@
44
from pathlib import Path
55
from typing import Final, Optional, Union
66

7-
from bs4 import BeautifulSoup
7+
from bs4 import BeautifulSoup, Tag
88
from docling_core.types.doc import (
99
DocItemLabel,
1010
DoclingDocument,
1111
DocumentOrigin,
1212
GroupItem,
1313
GroupLabel,
1414
NodeItem,
15-
TableCell,
16-
TableData,
1715
TextItem,
1816
)
1917
from lxml import etree
2018
from typing_extensions import TypedDict, override
2119

2220
from docling.backend.abstract_backend import DeclarativeDocumentBackend
21+
from docling.backend.html_backend import HTMLDocumentBackend
2322
from docling.datamodel.base_models import InputFormat
2423
from docling.datamodel.document import InputDocument
2524

@@ -540,71 +539,10 @@ def _add_table(
540539
) -> None:
541540
soup = BeautifulSoup(table_xml_component["content"], "html.parser")
542541
table_tag = soup.find("table")
543-
544-
nested_tables = table_tag.find("table")
545-
if nested_tables:
546-
_log.warning(f"Skipping nested table in {str(self.file)}")
542+
if not isinstance(table_tag, Tag):
547543
return
548544

549-
# Count the number of rows (number of <tr> elements)
550-
num_rows = len(table_tag.find_all("tr"))
551-
552-
# Find the number of columns (taking into account colspan)
553-
num_cols = 0
554-
for row in table_tag.find_all("tr"):
555-
col_count = 0
556-
for cell in row.find_all(["td", "th"]):
557-
colspan = int(cell.get("colspan", 1))
558-
col_count += colspan
559-
num_cols = max(num_cols, col_count)
560-
561-
grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
562-
563-
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
564-
565-
# Iterate over the rows in the table
566-
for row_idx, row in enumerate(table_tag.find_all("tr")):
567-
# For each row, find all the column cells (both <td> and <th>)
568-
cells = row.find_all(["td", "th"])
569-
570-
# Check if each cell in the row is a header -> means it is a column header
571-
col_header = True
572-
for j, html_cell in enumerate(cells):
573-
if html_cell.name == "td":
574-
col_header = False
575-
576-
# Extract and print the text content of each cell
577-
col_idx = 0
578-
for _, html_cell in enumerate(cells):
579-
# extract inline formulas
580-
for formula in html_cell.find_all("inline-formula"):
581-
math_parts = formula.text.split("$$")
582-
if len(math_parts) == 3:
583-
math_formula = f"$${math_parts[1]}$$"
584-
formula.replaceWith(math_formula)
585-
text = html_cell.text
586-
587-
col_span = int(html_cell.get("colspan", 1))
588-
row_span = int(html_cell.get("rowspan", 1))
589-
590-
while grid[row_idx][col_idx] is not None:
591-
col_idx += 1
592-
for r in range(row_span):
593-
for c in range(col_span):
594-
grid[row_idx + r][col_idx + c] = text
595-
596-
cell = TableCell(
597-
text=text,
598-
row_span=row_span,
599-
col_span=col_span,
600-
start_row_offset_idx=row_idx,
601-
end_row_offset_idx=row_idx + row_span,
602-
start_col_offset_idx=col_idx,
603-
end_col_offset_idx=col_idx + col_span,
604-
col_header=col_header,
605-
row_header=((not col_header) and html_cell.name == "th"),
606-
)
607-
data.table_cells.append(cell)
545+
data = HTMLDocumentBackend.parse_table_data(table_tag)
608546

609547
# TODO: format label vs caption once styling is supported
610548
label = table_xml_component["label"]
@@ -616,7 +554,8 @@ def _add_table(
616554
else None
617555
)
618556

619-
doc.add_table(data=data, parent=parent, caption=table_caption)
557+
if data is not None:
558+
doc.add_table(data=data, parent=parent, caption=table_caption)
620559

621560
return
622561

@@ -673,7 +612,6 @@ def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> Non
673612
def _walk_linear(
674613
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
675614
) -> str:
676-
# _log.debug(f"Walking on {node.tag} with {len(list(node))} children")
677615
skip_tags = ["term"]
678616
flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"]
679617
new_parent: NodeItem = parent

docling/backend/xml/uspto_backend.py

Lines changed: 48 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@
1414
from enum import Enum, unique
1515
from io import BytesIO
1616
from pathlib import Path
17-
from typing import Any, Final, Optional, Union
17+
from typing import Final, Optional, Union
1818

1919
from bs4 import BeautifulSoup, Tag
2020
from docling_core.types.doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
14061406
http://oasis-open.org/specs/soextblx.dtd
14071407
"""
14081408

1409+
class ColInfo(TypedDict):
1410+
ncols: int
1411+
colinfo: list[dict]
1412+
14091413
class MinColInfoType(TypedDict):
14101414
offset: list[int]
14111415
colwidth: list[int]
@@ -1425,7 +1429,7 @@ def __init__(self, input: str) -> None:
14251429
self.empty_text = ""
14261430
self._soup = BeautifulSoup(input, features="xml")
14271431

1428-
def _create_tg_range(self, tgs: list[dict[str, Any]]) -> dict[int, ColInfoType]:
1432+
def _create_tg_range(self, tgs: list[ColInfo]) -> dict[int, ColInfoType]:
14291433
"""Create a unified range along the table groups.
14301434
14311435
Args:
@@ -1532,19 +1536,26 @@ def _parse_table(self, table: Tag) -> TableData:
15321536
Returns:
15331537
A docling table object.
15341538
"""
1535-
tgs_align = []
1536-
tg_secs = table.find_all("tgroup")
1539+
tgs_align: list[XmlTable.ColInfo] = []
1540+
tg_secs = table("tgroup")
15371541
if tg_secs:
15381542
for tg_sec in tg_secs:
1539-
ncols = tg_sec.get("cols", None)
1540-
if ncols:
1541-
ncols = int(ncols)
1542-
tg_align = {"ncols": ncols, "colinfo": []}
1543-
cs_secs = tg_sec.find_all("colspec")
1543+
if not isinstance(tg_sec, Tag):
1544+
continue
1545+
col_val = tg_sec.get("cols")
1546+
ncols = (
1547+
int(col_val)
1548+
if isinstance(col_val, str) and col_val.isnumeric()
1549+
else 1
1550+
)
1551+
tg_align: XmlTable.ColInfo = {"ncols": ncols, "colinfo": []}
1552+
cs_secs = tg_sec("colspec")
15441553
if cs_secs:
15451554
for cs_sec in cs_secs:
1546-
colname = cs_sec.get("colname", None)
1547-
colwidth = cs_sec.get("colwidth", None)
1555+
if not isinstance(cs_sec, Tag):
1556+
continue
1557+
colname = cs_sec.get("colname")
1558+
colwidth = cs_sec.get("colwidth")
15481559
tg_align["colinfo"].append(
15491560
{"colname": colname, "colwidth": colwidth}
15501561
)
@@ -1565,40 +1576,50 @@ def _parse_table(self, table: Tag) -> TableData:
15651576
table_data: list[TableCell] = []
15661577
i_row_global = 0
15671578
is_row_empty: bool = True
1568-
tg_secs = table.find_all("tgroup")
1579+
tg_secs = table("tgroup")
15691580
if tg_secs:
15701581
for itg, tg_sec in enumerate(tg_secs):
1582+
if not isinstance(tg_sec, Tag):
1583+
continue
15711584
tg_range = tgs_range[itg]
1572-
row_secs = tg_sec.find_all(["row", "tr"])
1585+
row_secs = tg_sec(["row", "tr"])
15731586

15741587
if row_secs:
15751588
for row_sec in row_secs:
1576-
entry_secs = row_sec.find_all(["entry", "td"])
1577-
is_header: bool = row_sec.parent.name in ["thead"]
1589+
if not isinstance(row_sec, Tag):
1590+
continue
1591+
entry_secs = row_sec(["entry", "td"])
1592+
is_header: bool = (
1593+
row_sec.parent is not None
1594+
and row_sec.parent.name == "thead"
1595+
)
15781596

15791597
ncols = 0
15801598
local_row: list[TableCell] = []
15811599
is_row_empty = True
15821600
if entry_secs:
15831601
wrong_nbr_cols = False
15841602
for ientry, entry_sec in enumerate(entry_secs):
1603+
if not isinstance(entry_sec, Tag):
1604+
continue
15851605
text = entry_sec.get_text().strip()
15861606

15871607
# start-end
1588-
namest = entry_sec.attrs.get("namest", None)
1589-
nameend = entry_sec.attrs.get("nameend", None)
1590-
if isinstance(namest, str) and namest.isnumeric():
1591-
namest = int(namest)
1592-
else:
1593-
namest = ientry + 1
1608+
namest = entry_sec.get("namest")
1609+
nameend = entry_sec.get("nameend")
1610+
start = (
1611+
int(namest)
1612+
if isinstance(namest, str) and namest.isnumeric()
1613+
else ientry + 1
1614+
)
15941615
if isinstance(nameend, str) and nameend.isnumeric():
1595-
nameend = int(nameend)
1616+
end = int(nameend)
15961617
shift = 0
15971618
else:
1598-
nameend = ientry + 2
1619+
end = ientry + 2
15991620
shift = 1
16001621

1601-
if nameend > len(tg_range["cell_offst"]):
1622+
if end > len(tg_range["cell_offst"]):
16021623
wrong_nbr_cols = True
16031624
self.nbr_messages += 1
16041625
if self.nbr_messages <= self.max_nbr_messages:
@@ -1608,8 +1629,8 @@ def _parse_table(self, table: Tag) -> TableData:
16081629
break
16091630

16101631
range_ = [
1611-
tg_range["cell_offst"][namest - 1],
1612-
tg_range["cell_offst"][nameend - 1] - shift,
1632+
tg_range["cell_offst"][start - 1],
1633+
tg_range["cell_offst"][end - 1] - shift,
16131634
]
16141635

16151636
# add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ def parse(self) -> Optional[TableData]:
16681689
A docling table data.
16691690
"""
16701691
section = self._soup.find("table")
1671-
if section is not None:
1692+
if isinstance(section, Tag):
16721693
table = self._parse_table(section)
16731694
if table.num_rows == 0 or table.num_cols == 0:
16741695
_log.warning("The parsed USPTO table is empty")

0 commit comments

Comments
 (0)