14
14
from enum import Enum , unique
15
15
from io import BytesIO
16
16
from pathlib import Path
17
- from typing import Any , Final , Optional , Union
17
+ from typing import Final , Optional , Union
18
18
19
19
from bs4 import BeautifulSoup , Tag
20
20
from docling_core .types .doc import (
@@ -1406,6 +1406,10 @@ class XmlTable:
1406
1406
http://oasis-open.org/specs/soextblx.dtd
1407
1407
"""
1408
1408
1409
+ class ColInfo (TypedDict ):
1410
+ ncols : int
1411
+ colinfo : list [dict ]
1412
+
1409
1413
class MinColInfoType (TypedDict ):
1410
1414
offset : list [int ]
1411
1415
colwidth : list [int ]
@@ -1425,7 +1429,7 @@ def __init__(self, input: str) -> None:
1425
1429
self .empty_text = ""
1426
1430
self ._soup = BeautifulSoup (input , features = "xml" )
1427
1431
1428
- def _create_tg_range (self , tgs : list [dict [ str , Any ] ]) -> dict [int , ColInfoType ]:
1432
+ def _create_tg_range (self , tgs : list [ColInfo ]) -> dict [int , ColInfoType ]:
1429
1433
"""Create a unified range along the table groups.
1430
1434
1431
1435
Args:
@@ -1532,19 +1536,26 @@ def _parse_table(self, table: Tag) -> TableData:
1532
1536
Returns:
1533
1537
A docling table object.
1534
1538
"""
1535
- tgs_align = []
1536
- tg_secs = table . find_all ("tgroup" )
1539
+ tgs_align : list [ XmlTable . ColInfo ] = []
1540
+ tg_secs = table ("tgroup" )
1537
1541
if tg_secs :
1538
1542
for tg_sec in tg_secs :
1539
- ncols = tg_sec .get ("cols" , None )
1540
- if ncols :
1541
- ncols = int (ncols )
1542
- tg_align = {"ncols" : ncols , "colinfo" : []}
1543
- cs_secs = tg_sec .find_all ("colspec" )
1543
+ if not isinstance (tg_sec , Tag ):
1544
+ continue
1545
+ col_val = tg_sec .get ("cols" )
1546
+ ncols = (
1547
+ int (col_val )
1548
+ if isinstance (col_val , str ) and col_val .isnumeric ()
1549
+ else 1
1550
+ )
1551
+ tg_align : XmlTable .ColInfo = {"ncols" : ncols , "colinfo" : []}
1552
+ cs_secs = tg_sec ("colspec" )
1544
1553
if cs_secs :
1545
1554
for cs_sec in cs_secs :
1546
- colname = cs_sec .get ("colname" , None )
1547
- colwidth = cs_sec .get ("colwidth" , None )
1555
+ if not isinstance (cs_sec , Tag ):
1556
+ continue
1557
+ colname = cs_sec .get ("colname" )
1558
+ colwidth = cs_sec .get ("colwidth" )
1548
1559
tg_align ["colinfo" ].append (
1549
1560
{"colname" : colname , "colwidth" : colwidth }
1550
1561
)
@@ -1565,40 +1576,50 @@ def _parse_table(self, table: Tag) -> TableData:
1565
1576
table_data : list [TableCell ] = []
1566
1577
i_row_global = 0
1567
1578
is_row_empty : bool = True
1568
- tg_secs = table . find_all ("tgroup" )
1579
+ tg_secs = table ("tgroup" )
1569
1580
if tg_secs :
1570
1581
for itg , tg_sec in enumerate (tg_secs ):
1582
+ if not isinstance (tg_sec , Tag ):
1583
+ continue
1571
1584
tg_range = tgs_range [itg ]
1572
- row_secs = tg_sec . find_all (["row" , "tr" ])
1585
+ row_secs = tg_sec (["row" , "tr" ])
1573
1586
1574
1587
if row_secs :
1575
1588
for row_sec in row_secs :
1576
- entry_secs = row_sec .find_all (["entry" , "td" ])
1577
- is_header : bool = row_sec .parent .name in ["thead" ]
1589
+ if not isinstance (row_sec , Tag ):
1590
+ continue
1591
+ entry_secs = row_sec (["entry" , "td" ])
1592
+ is_header : bool = (
1593
+ row_sec .parent is not None
1594
+ and row_sec .parent .name == "thead"
1595
+ )
1578
1596
1579
1597
ncols = 0
1580
1598
local_row : list [TableCell ] = []
1581
1599
is_row_empty = True
1582
1600
if entry_secs :
1583
1601
wrong_nbr_cols = False
1584
1602
for ientry , entry_sec in enumerate (entry_secs ):
1603
+ if not isinstance (entry_sec , Tag ):
1604
+ continue
1585
1605
text = entry_sec .get_text ().strip ()
1586
1606
1587
1607
# start-end
1588
- namest = entry_sec .attrs .get ("namest" , None )
1589
- nameend = entry_sec .attrs .get ("nameend" , None )
1590
- if isinstance (namest , str ) and namest .isnumeric ():
1591
- namest = int (namest )
1592
- else :
1593
- namest = ientry + 1
1608
+ namest = entry_sec .get ("namest" )
1609
+ nameend = entry_sec .get ("nameend" )
1610
+ start = (
1611
+ int (namest )
1612
+ if isinstance (namest , str ) and namest .isnumeric ()
1613
+ else ientry + 1
1614
+ )
1594
1615
if isinstance (nameend , str ) and nameend .isnumeric ():
1595
- nameend = int (nameend )
1616
+ end = int (nameend )
1596
1617
shift = 0
1597
1618
else :
1598
- nameend = ientry + 2
1619
+ end = ientry + 2
1599
1620
shift = 1
1600
1621
1601
- if nameend > len (tg_range ["cell_offst" ]):
1622
+ if end > len (tg_range ["cell_offst" ]):
1602
1623
wrong_nbr_cols = True
1603
1624
self .nbr_messages += 1
1604
1625
if self .nbr_messages <= self .max_nbr_messages :
@@ -1608,8 +1629,8 @@ def _parse_table(self, table: Tag) -> TableData:
1608
1629
break
1609
1630
1610
1631
range_ = [
1611
- tg_range ["cell_offst" ][namest - 1 ],
1612
- tg_range ["cell_offst" ][nameend - 1 ] - shift ,
1632
+ tg_range ["cell_offst" ][start - 1 ],
1633
+ tg_range ["cell_offst" ][end - 1 ] - shift ,
1613
1634
]
1614
1635
1615
1636
# add row and replicate cell if needed
@@ -1668,7 +1689,7 @@ def parse(self) -> Optional[TableData]:
1668
1689
A docling table data.
1669
1690
"""
1670
1691
section = self ._soup .find ("table" )
1671
- if section is not None :
1692
+ if isinstance ( section , Tag ) :
1672
1693
table = self ._parse_table (section )
1673
1694
if table .num_rows == 0 or table .num_cols == 0 :
1674
1695
_log .warning ("The parsed USPTO table is empty" )
0 commit comments