|
6 | 6 | from docling_core.types.doc import (
|
7 | 7 | BoundingBox,
|
8 | 8 | CoordOrigin,
|
| 9 | + DocItem, |
9 | 10 | DocItemLabel,
|
10 | 11 | DoclingDocument,
|
11 | 12 | DocumentOrigin,
|
12 | 13 | GroupLabel,
|
| 14 | + NodeItem, |
13 | 15 | ProvenanceItem,
|
14 | 16 | RefItem,
|
15 | 17 | TableData,
|
|
24 | 26 | from pydantic import BaseModel, ConfigDict
|
25 | 27 |
|
26 | 28 | from docling.datamodel.base_models import (
|
| 29 | + BasePageElement, |
27 | 30 | Cluster,
|
28 | 31 | ContainerElement,
|
29 | 32 | FigureElement,
|
@@ -80,6 +83,35 @@ def _assembled_to_readingorder_elements(
|
80 | 83 |
|
81 | 84 | return elements
|
82 | 85 |
|
def _add_child_elements(
    self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
):
    """Add every child cluster of ``element`` to ``doc`` beneath ``doc_item``.

    For each cluster in ``element.cluster.children`` the non-blank cell
    texts are joined with single spaces, and the result is added to ``doc``
    as a list item, a heading, or a generic text item, depending on the
    child's label. Every created item receives one provenance entry built
    from the element's page number, the character span of the joined text,
    and the child's bounding box converted to a bottom-left origin.

    Args:
        element: Page element whose cluster children are converted.
        doc_item: Node passed as ``parent`` for every item that is created.
        doc: Document that receives the new items (modified in place).
    """
    # NOTE(review): ``element.page_no`` is used unchanged both as the key
    # into ``doc.pages`` and as the provenance page number -- confirm that it
    # follows the same page numbering as ``doc.pages``.
    sub_cluster: Cluster
    for sub_cluster in element.cluster.children:
        kind = sub_cluster.label

        # The page height is needed to flip the box to a bottom-left origin.
        page_height = doc.pages[element.page_no].size.height
        flipped_bbox = sub_cluster.bbox.to_bottom_left_origin(page_height)

        # Blank cells are dropped; in the kept ones "\x02" becomes "-".
        fragments = (
            cell.text.replace("\x02", "-").strip()
            for cell in sub_cluster.cells
            if cell.text.strip()
        )
        joined_text = " ".join(fragments)

        provenance = ProvenanceItem(
            page_no=element.page_no,
            charspan=(0, len(joined_text)),
            bbox=flipped_bbox,
        )

        if kind == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            doc.add_list_item(parent=doc_item, text=joined_text, prov=provenance)
            continue

        if kind == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=doc_item, text=joined_text, prov=provenance)
            continue

        # Any other label is stored as a plain text item carrying that label.
        doc.add_text(
            parent=doc_item, label=kind, text=joined_text, prov=provenance
        )
| 114 | + |
83 | 115 | def _readingorder_elements_to_docling_doc(
|
84 | 116 | self,
|
85 | 117 | conv_res: ConversionResult,
|
@@ -123,8 +155,6 @@ def _readingorder_elements_to_docling_doc(
|
123 | 155 | for cid in lst
|
124 | 156 | }
|
125 | 157 |
|
126 |
| - # TODO: handle merges |
127 |
| - |
128 | 158 | for rel in ro_elements:
|
129 | 159 | if rel.cid in skippable_cids:
|
130 | 160 | continue
|
@@ -181,7 +211,7 @@ def _readingorder_elements_to_docling_doc(
|
181 | 211 |
|
182 | 212 | tbl.footnotes.append(new_footnote_item.get_ref())
|
183 | 213 |
|
184 |
| - # TODO: handle element.cluster.children. |
| 214 | + # TODO: Consider adding children of Table. |
185 | 215 |
|
186 | 216 | elif isinstance(element, FigureElement):
|
187 | 217 | cap_text = ""
|
@@ -210,12 +240,19 @@ def _readingorder_elements_to_docling_doc(
|
210 | 240 |
|
211 | 241 | pic.footnotes.append(new_footnote_item.get_ref())
|
212 | 242 |
|
213 |
| - # TODO: handle element.cluster.children. |
214 |
| - # _add_child_elements(pic, doc, obj, pelem) |
| 243 | + self._add_child_elements(element, pic, out_doc) |
215 | 244 |
|
216 | 245 | elif isinstance(element, ContainerElement): # Form, KV region
|
217 |
| - pass |
218 |
| - # TODO: handle element.cluster.children. |
| 246 | + label = element.label |
| 247 | + group_label = GroupLabel.UNSPECIFIED |
| 248 | + if label == DocItemLabel.FORM: |
| 249 | + group_label = GroupLabel.FORM_AREA |
| 250 | + elif label == DocItemLabel.KEY_VALUE_REGION: |
| 251 | + group_label = GroupLabel.KEY_VALUE_AREA |
| 252 | + |
| 253 | + container_el = out_doc.add_group(label=group_label) |
| 254 | + |
| 255 | + self._add_child_elements(element, container_el, out_doc) |
219 | 256 |
|
220 | 257 | return out_doc
|
221 | 258 |
|
@@ -284,7 +321,7 @@ def _merge_elements(self, element, merged_elem, new_item, page_height):
|
284 | 321 | bbox=element.cluster.bbox.to_bottom_left_origin(page_height),
|
285 | 322 | )
|
286 | 323 | new_item.text += f" {merged_elem.text}"
|
287 |
| - new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete. |
| 324 | + new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element. |
288 | 325 | new_item.prov.append(prov)
|
289 | 326 |
|
290 | 327 | def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
|
|
0 commit comments