From 122466f15b89b31b3be5ecf9a742e149f8e5d3a8 Mon Sep 17 00:00:00 2001 From: Panos Vagenas Date: Tue, 18 Feb 2025 22:54:07 +0100 Subject: [PATCH] docs: revamp picture description example Signed-off-by: Panos Vagenas --- docs/examples/pictures_description.ipynb | 136 +++++++++++++++++++++++ docs/examples/pictures_description.py | 48 -------- 2 files changed, 136 insertions(+), 48 deletions(-) create mode 100644 docs/examples/pictures_description.ipynb delete mode 100644 docs/examples/pictures_description.py diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb new file mode 100644 index 00000000..f906a7aa --- /dev/null +++ b/docs/examples/pictures_description.ipynb @@ -0,0 +1,136 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install -q docling ipython" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from docling.datamodel.base_models import InputFormat\n", + "from docling.datamodel.pipeline_options import ( # granite_picture_description,\n", + " PdfPipelineOptions,\n", + " smolvlm_picture_description,\n", + ")\n", + "from docling.document_converter import DocumentConverter, PdfFormatOption" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "DOC_SOURCE = \"https://arxiv.org/pdf/2206.01062\"\n", + "\n", + "pipeline_options = PdfPipelineOptions()\n", + "pipeline_options.do_picture_description = True\n", + "pipeline_options.picture_description_options = smolvlm_picture_description\n", + "# pipeline_options.picture_description_options = granite_picture_description\n", + 
"pipeline_options.picture_description_options.prompt = (\n", + " \"Describe the image in three sentences. Be concise and accurate.\"\n", + ")\n", + "pipeline_options.images_scale = 2.0\n", + "pipeline_options.generate_picture_images = True\n", + "\n", + "converter = DocumentConverter(\n", + " format_options={\n", + " InputFormat.PDF: PdfFormatOption(\n", + " pipeline_options=pipeline_options,\n", + " )\n", + " }\n", + ")\n", + "doc = converter.convert(DOC_SOURCE).document" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

Picture #/pictures/0


Caption

Figure 1: Four examples of complex page layouts across different document categories

Annotations

[PictureDescriptionData(kind='description', text='An advertisement with a blue background, an image of a building, and text about the 175 years of looking forward.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]\n", + "

Picture #/pictures/1


Caption

Figure 2: Distribution of DocLayNet pages across document categories.

Annotations

[PictureDescriptionData(kind='description', text='The image is a pie chart that represents the distribution of various categories. The chart is divided into four sections, each representing a different category. The categories are: Financial, Tenders, Laws, and Manuals. \\n\\n### Description of the Pie Chart:\\n1. **Financial Categories:**\\n - **Financial:** 32%\\n - **Tenders:** 6%\\n - **Laws:** 16%\\n - **Manuals:** 21%\\n\\n2. **Tenders:**\\n - **Tenders:** 16%\\n - **Laws:** 16%\\n - **Manuals:** 16%\\n\\n3. **Laws:**\\n - **Laws:** 16%\\n - **Manuals:** 16%\\n\\n4. **Manuals:**\\n - **Manuals:** 21%\\n\\n### Analysis:\\nThe pie chart is a visual representation of the distribution of', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]\n", + "

Picture #/pictures/2


Caption

Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.

Annotations

[PictureDescriptionData(kind='description', text='The image is a table that contains field labels and a list of fields. The table is titled \"Field Labels.\" The table has five columns and five rows. The first column is labeled \"Clusters,\" the second column is labeled \"Clusters,\" the third column is labeled \"Clusters,\" the fourth column is labeled \"Clusters,\" and the fifth column is labeled \"Clusters.\"\\n\\nThe table is structured in a way that it is easy to understand. The first row of the table contains the following fields:\\n\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n\\nThe second row of the table contains the following fields:\\n\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n- \"Clusters\"\\n\\nThe third row of the', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]\n", + "

Picture #/pictures/3


Caption

Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.

Annotations

[PictureDescriptionData(kind='description', text='Figure 1.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]\n", + "

Picture #/pictures/4


Caption

Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNNnetworkwithResNet50backbonetrainedonincreasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.

Annotations

[PictureDescriptionData(kind='description', text='The image is a line graph that shows the percentage of DocLayNet training set as a percentage of the total training set. The x-axis represents the percentage of training set, ranging from 0 to 100. The y-axis represents the percentage of training set, ranging from 0 to 100. The graph shows a continuous trend of increasing training set percentage over time.\\n\\n### Description of the Graph:\\n1. **X-Axis (Percentage of Training Set):**\\n - The x-axis is labeled \"Percentage of DocLayNet training set.\"\\n - The range of the x-axis is from 0 to 100.\\n\\n2. **Y-Axis (Percentage of Training Set):**\\n - The y-axis is labeled \"MAP:0.500-0.95.\"\\n - The range of the y-axis is from 0 to 100.\\n\\n3.', provenance='HuggingFaceTB/SmolVLM-256M-Instruct')]\n" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from IPython import display\n", + "\n", + "html_buffer = []\n", + "# display the first 5 pictures and their captions and annotations:\n", + "for pic in doc.pictures[:5]:\n", + " html_buffer.append(\n", + " f\"

Picture {pic.self_ref}

\"\n", + " f'
'\n", + " f\"

Caption

{pic.caption_text(doc=doc)}
\"\n", + " f\"

Annotations

{pic.annotations}\\n\"\n", + " )\n", + "display.HTML(\"
\".join(html_buffer))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.8" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/examples/pictures_description.py b/docs/examples/pictures_description.py deleted file mode 100644 index f60ac29d..00000000 --- a/docs/examples/pictures_description.py +++ /dev/null @@ -1,48 +0,0 @@ -import logging -from pathlib import Path - -from docling_core.types.doc import PictureItem - -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import ( - PdfPipelineOptions, - granite_picture_description, - smolvlm_picture_description, -) -from docling.document_converter import DocumentConverter, PdfFormatOption - - -def main(): - logging.basicConfig(level=logging.INFO) - - input_doc_path = Path("./tests/data/pdf/2206.01062.pdf") - - pipeline_options = PdfPipelineOptions() - pipeline_options.do_picture_description = True - pipeline_options.picture_description_options = smolvlm_picture_description - # pipeline_options.picture_description_options = granite_picture_description - - pipeline_options.picture_description_options.prompt = ( - "Describe the image in three sentences. Be consise and accurate." 
- ) - - doc_converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_options=pipeline_options, - ) - } - ) - result = doc_converter.convert(input_doc_path) - - for element, _level in result.document.iterate_items(): - if isinstance(element, PictureItem): - print( - f"Picture {element.self_ref}\n" - f"Caption: {element.caption_text(doc=result.document)}\n" - f"Annotations: {element.annotations}" - ) - - -if __name__ == "__main__": - main()