|
6 | 6 | from docling_core.types.doc import ( |
7 | 7 | BoundingBox, |
8 | 8 | CoordOrigin, |
| 9 | + DocItem, |
9 | 10 | DocItemLabel, |
10 | 11 | DoclingDocument, |
11 | 12 | DocumentOrigin, |
12 | 13 | GroupLabel, |
| 14 | + NodeItem, |
13 | 15 | ProvenanceItem, |
14 | 16 | RefItem, |
15 | 17 | TableData, |
|
24 | 26 | from pydantic import BaseModel, ConfigDict |
25 | 27 |
|
26 | 28 | from docling.datamodel.base_models import ( |
| 29 | + BasePageElement, |
27 | 30 | Cluster, |
28 | 31 | ContainerElement, |
29 | 32 | FigureElement, |
@@ -80,6 +83,35 @@ def _assembled_to_readingorder_elements( |
80 | 83 |
|
81 | 84 | return elements |
82 | 85 |
|
def _add_child_elements(
    self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
):
    """Add each child cluster of ``element`` to ``doc`` as a child of ``doc_item``.

    For every cluster in ``element.cluster.children``:

    * the texts of its non-blank cells are cleaned (``"\\x02"`` replaced by
      ``"-"``, surrounding whitespace stripped) and joined with single spaces;
    * one ``ProvenanceItem`` is built from ``element.page_no``, the char span
      ``(0, len(text))`` and the cluster bounding box converted to a
      bottom-left origin;
    * the text is added via ``doc.add_list_item``, ``doc.add_heading`` or
      ``doc.add_text``, depending on the cluster label.
    """
    # NOTE(review): ``element.page_no`` is used unchanged both as the key into
    # ``doc.pages`` and as the provenance page number -- confirm that it uses
    # the same numbering base as the pages registered on ``doc``.
    kid: Cluster
    for kid in element.cluster.children:
        kid_label = kid.label
        kid_bbox = kid.bbox.to_bottom_left_origin(
            doc.pages[element.page_no].size.height
        )

        # Cells whose text is only whitespace are dropped.
        cleaned_texts = (
            cell.text.replace("\x02", "-").strip()
            for cell in kid.cells
            if cell.text.strip()
        )
        kid_text = " ".join(cleaned_texts)

        kid_prov = ProvenanceItem(
            page_no=element.page_no,
            charspan=(0, len(kid_text)),
            bbox=kid_bbox,
        )

        if kid_label == DocItemLabel.LIST_ITEM:
            # TODO: Infer if this is a numbered or a bullet list item
            doc.add_list_item(parent=doc_item, text=kid_text, prov=kid_prov)
            continue

        if kid_label == DocItemLabel.SECTION_HEADER:
            doc.add_heading(parent=doc_item, text=kid_text, prov=kid_prov)
            continue

        # Any other label is kept on a generic text item.
        doc.add_text(parent=doc_item, label=kid_label, text=kid_text, prov=kid_prov)
83 | 115 | def _readingorder_elements_to_docling_doc( |
84 | 116 | self, |
85 | 117 | conv_res: ConversionResult, |
@@ -123,8 +155,6 @@ def _readingorder_elements_to_docling_doc( |
123 | 155 | for cid in lst |
124 | 156 | } |
125 | 157 |
|
126 | | - # TODO: handle merges |
127 | | - |
128 | 158 | for rel in ro_elements: |
129 | 159 | if rel.cid in skippable_cids: |
130 | 160 | continue |
@@ -181,7 +211,7 @@ def _readingorder_elements_to_docling_doc( |
181 | 211 |
|
182 | 212 | tbl.footnotes.append(new_footnote_item.get_ref()) |
183 | 213 |
|
184 | | - # TODO: handle element.cluster.children. |
| 214 | + # TODO: Consider adding children of Table. |
185 | 215 |
|
186 | 216 | elif isinstance(element, FigureElement): |
187 | 217 | cap_text = "" |
@@ -210,12 +240,19 @@ def _readingorder_elements_to_docling_doc( |
210 | 240 |
|
211 | 241 | pic.footnotes.append(new_footnote_item.get_ref()) |
212 | 242 |
|
213 | | - # TODO: handle element.cluster.children. |
214 | | - # _add_child_elements(pic, doc, obj, pelem) |
| 243 | + self._add_child_elements(element, pic, out_doc) |
215 | 244 |
|
216 | 245 | elif isinstance(element, ContainerElement): # Form, KV region |
217 | | - pass |
218 | | - # TODO: handle element.cluster.children. |
| 246 | + label = element.label |
| 247 | + group_label = GroupLabel.UNSPECIFIED |
| 248 | + if label == DocItemLabel.FORM: |
| 249 | + group_label = GroupLabel.FORM_AREA |
| 250 | + elif label == DocItemLabel.KEY_VALUE_REGION: |
| 251 | + group_label = GroupLabel.KEY_VALUE_AREA |
| 252 | + |
| 253 | + container_el = out_doc.add_group(label=group_label) |
| 254 | + |
| 255 | + self._add_child_elements(element, container_el, out_doc) |
219 | 256 |
|
220 | 257 | return out_doc |
221 | 258 |
|
@@ -284,7 +321,7 @@ def _merge_elements(self, element, merged_elem, new_item, page_height): |
284 | 321 | bbox=element.cluster.bbox.to_bottom_left_origin(page_height), |
285 | 322 | ) |
286 | 323 | new_item.text += f" {merged_elem.text}" |
287 | | - new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete. |
| 324 | + new_item.orig += f" {merged_elem.text}" # TODO: This is incomplete, we don't have the `orig` field of the merged element. |
288 | 325 | new_item.prov.append(prov) |
289 | 326 |
|
290 | 327 | def __call__(self, conv_res: ConversionResult) -> DoclingDocument: |
|
0 commit comments