#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "openai>=1.12.0",
#     "openai-agents",
# ]
# ///

"""
Example of creating an Agent that extracts information from a PDF document
using the input_file content option of the OpenAI responses API.
"""

import asyncio
import base64
import json
import os
import sys
from typing import Any, Dict

try:
    from agents import Agent, Runner, set_default_openai_api
except ImportError:
    print("Required packages not found. Please run this script with uv:")
    print("uv run examples/extract_doc/pdf_extraction_agent.py")
    sys.exit(1)


def _pdf_input_file(pdf_path: str) -> Dict[str, str]:
    """
    Read a PDF from disk and wrap it as an ``input_file`` content item.

    The responses API accepts file content as a base64 data URL, so the PDF
    is read in binary mode and encoded here. Shared by both the extraction
    and verification calls.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        A content dict suitable for the ``content`` list of a user message.
    """
    with open(pdf_path, "rb") as f:
        pdf_base64 = base64.b64encode(f.read()).decode("utf-8")
    return {
        "type": "input_file",
        "filename": os.path.basename(pdf_path),
        "file_data": f"data:application/pdf;base64,{pdf_base64}",
    }


async def _ask_about_pdf(
    agent: Agent,
    pdf_path: str,
    prompt: str,
    failure_message: str,
    error_value: str,
) -> Dict[str, Any]:
    """
    Run *agent* over the PDF with *prompt* and parse the JSON reply.

    Args:
        agent: The agent to run
        pdf_path: Path to the PDF file
        prompt: Instruction text sent alongside the file
        failure_message: Message printed when the reply is not valid JSON
        error_value: Value stored under ``"error"`` on parse failure

    Returns:
        The parsed JSON object, or ``{"error": error_value}`` when the
        model's reply cannot be parsed.
    """
    input_with_pdf = [
        {
            "role": "user",
            "content": [
                {"type": "input_text", "text": prompt},
                _pdf_input_file(pdf_path),
            ],
        }
    ]
    result = await Runner.run(agent, input=input_with_pdf)
    response_text = result.final_output

    # The model may wrap the JSON in markdown code fences or surrounding
    # prose, so strip that before parsing.
    json_str = extract_json_from_text(response_text)
    try:
        return json.loads(json_str)
    except json.JSONDecodeError:
        print(failure_message)
        print(response_text)
        return {"error": error_value}


async def extract_data_from_pdf(agent: Agent, pdf_path: str) -> Dict[str, Any]:
    """
    Extract structured data from a PDF document using the OpenAI responses API.

    Args:
        agent: The agent to use for extraction
        pdf_path: Path to the PDF file

    Returns:
        Extracted structured data from the PDF
    """
    # Define the extraction schema - modify this based on what you want to extract
    extraction_schema = {
        "title": "string",
        "authors": ["string"],
        "publication_date": "string",
        "abstract": "string",
        "sections": [
            {
                "heading": "string",
                "content": "string"
            }
        ],
        "tables": [
            {
                "caption": "string",
                "data": [["string"]]
            }
        ],
        "figures": [
            {
                "caption": "string",
                "description": "string"
            }
        ],
        "references": ["string"]
    }

    prompt = (
        "Extract the following information from the PDF document in a structured format:\n"
        f"{json.dumps(extraction_schema, indent=2)}\n\n"
        "Return the extracted data as a JSON object that follows this schema exactly."
    )
    return await _ask_about_pdf(
        agent,
        pdf_path,
        prompt,
        failure_message="Failed to parse JSON response. Raw response:",
        error_value="Failed to parse response",
    )


def extract_json_from_text(text: str) -> str:
    """
    Extract JSON string from text that might contain markdown or other formatting.

    Handles three shapes, in order:

    1. A ```` ```json ```` fenced block
    2. A plain ```` ``` ```` fenced block
    3. A bare JSON object (first ``{`` to last ``}``)

    Falls back to returning *text* unchanged when none of these match.
    """
    for fence in ("```json", "```"):
        start = text.find(fence)
        if start != -1:
            start += len(fence)
            end = text.find("```", start)
            if end == -1:
                # Unterminated fence: take everything to the end of the text.
                # (Previously find() returned -1 here, silently dropping the
                # last character of the payload.)
                end = len(text)
            return text[start:end].strip()

    # If no code block, try to find the JSON object directly.
    start = text.find("{")
    end = text.rfind("}") + 1
    if start >= 0 and end > start:
        return text[start:end].strip()

    # If all else fails, return the original text.
    return text


async def verify_extraction(
    agent: Agent, pdf_path: str, extracted_data: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Verify if the extracted data is grounded in the PDF content.

    Args:
        agent: The agent to use for verification
        pdf_path: Path to the PDF file
        extracted_data: The extracted data to verify

    Returns:
        Verification results
    """
    prompt = (
        "Check if the following extracted data is grounded in the PDF content:\n\n"
        f"Extracted data:\n{json.dumps(extracted_data, indent=2)}\n\n"
        "Return a JSON object with the following structure:\n"
        "{ \"is_grounded\": boolean, \"ungrounded_items\": [{ \"path\": \"path.to.item\", \"value\": \"extracted value\", \"issue\": \"description of issue\" }] }"
    )
    return await _ask_about_pdf(
        agent,
        pdf_path,
        prompt,
        failure_message="Failed to parse verification JSON. Raw response:",
        error_value="Failed to parse verification response",
    )


async def extract_and_verify():
    """
    End-to-end example: extract data from the sample PDF, then verify it.

    Returns:
        ``(extracted_data, verification)`` on success, or ``(None, None)``
        when the sample PDF has not been generated yet.

    Raises:
        ValueError: If the OPENAI_API_KEY environment variable is not set.
    """
    # The input_file content type requires the responses API.
    set_default_openai_api("responses")
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("Please set the OPENAI_API_KEY environment variable")

    # Use the sample document created by the other script.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    pdf_path = os.path.join(current_dir, "sample_document.pdf")

    if not os.path.exists(pdf_path):
        print(f"Sample PDF not found at {pdf_path}")
        print("Please run the sample_document.py script first:")
        print("uv run examples/extract_doc/sample_document.py")
        return None, None

    pdf_agent = Agent(
        name="PDF Processing Agent",
        instructions="An agent that extracts and verifies information from PDF documents.",
        model="gpt-4o",
    )

    # Extract data
    print("Extracting data from PDF...")
    extracted_data = await extract_data_from_pdf(pdf_agent, pdf_path)
    print("Extracted data:")
    print(json.dumps(extracted_data, indent=2))

    # Verify extraction
    print("\nVerifying extraction...")
    verification = await verify_extraction(pdf_agent, pdf_path, extracted_data)
    print("Verification results:")
    print(json.dumps(verification, indent=2))

    return extracted_data, verification


if __name__ == "__main__":
    asyncio.run(extract_and_verify())
zlX3hT&)={3pIFv+s@|LUH>yzHcMpF?AN42oy-ocieZ#$bdA#DiezWRdnOXRgYU%IP z3b6;`h&=(4h^cWb`p)5E@&3=iGM*DaiKKusO95*D*R1dzTlQBz!+H9z&~~1{XDr`A zc?UO^poP9Ya-vuO{ssj;c7BcnCf}`-!+&`TU*0mXj@=L7h9AWsikIW#(DFV-zUd#p z*Fb6U;_m`gidnxp?wbW)^#4~_Q2x$>w{d;{5XX4(bOyc`58kBbS6BavRl$lZ*FSyd z1?D>$tygsRLm<1#TAuIya`#`)6$}Ho5IR<L@jdbF->g`n2#O{OBu$n{nkwfAGD8rV zuYA7>czMsju`kDqbNCdYIKTYlhleMSjrrnz{*kCS{)d<C#GCn+C*D9%FZ<96e*sY7 zcX<K+40ZgCUL8An8}Z{F#+S&A_y41HF<OEkPCj_Ya-ZJ=<PFkq0C~mWjK4|c5*x7u z-taeoH-wBgtbg_Hr^CPV?dRw4RYK?DMzK9Hpj#tBmKj=f;*|5wA8zc{FOyi5>UgK< z43Om_`4;tzV&~sKBrB@rL;@yr=`O=yi^R0`I9BDF+<2Lnt3Cs_`^mlC*`3a)15SjV z5caD`HzD6h2olaPHENZN$7+W7%>E2*6V2dtYLJI>W#lB>-1^=dZmw+433xg0K)Y#u zGb8j>1h4Wl%162eS89zU!Rp)X<btbnCM%ZKt0Z+(Z8bl&ncy(&*w1k~nZ6MqxoO;e zL{!IXB4X#_Zjenr+>Yj@9o<f^q+8o4HuUQ%*`T!roI@0du(oBPuFZ)-tFh!0jhUc; zV6V{YB{p(?pJ^WP_GH3k?dKAP2XF;XG%@4;GEj4QzZMVjO%vQDzC%jS!-i{IZ<8QV zx^;x6incEKaN90p4OqxDa>J+eFp`<rn~0eZde6QN>W?&b>!Y<RDe}`>Az`-I#sZd; zIwPa>ElZDF7$ift?i<9beT;fCRcbS8K-G%8HLX7JL_@Vxg14c->XeMKSuiE1v#c{X zPPBl?>8Y8(D$D-!fDadJZ<AcPCdneN9=|WI%V+_*xOio=uNBS~LU5N-x0kziaCpu& z_d${+GnG3yC&Kyy^Y%QrJWmb!?(+BgK<8-V3ZWxB_JxFki0D>JR{26dlzeDS?%7rH z=Ar%eh@5f=?eyzvI^Q?iYkOfRd1|248p^9MxmF=gzNnQ&Imot02tkEQ^Jp)wQ+gOK zLJ}FWY@~7I9$Gl-dbr~ggI)3<Gw0PjLqwo4yeu`b&e?-#HHp$Dc$pGtH;9Ij{kS2} zsXwg@w@A3mmnR2$SvX&0E}2+W^IUtT2fYJHrx}yP3LCC93IwQ7@twrTmR8~Q^%&5t zLZ2fl(^O_BZj9cG;;+y?JR<du5U{t~_0?~u`Uy@hHnvteqmSKvwv@cPqIJD5ju7)8 zbFkAV_r>e*ETl*@>j(8y+Xi#tyqbiU>4}`roEAA6?$dqT?4{ds5Yopnp+CTlqqCdp zWy{sY_Ea>PyaM;FB0pi3^{$f;#TpEz@4NLTRjD}pV_4No{qnJYMGD99GP@f$7VVYR z%xyhM7Rx)&x?i~xo&XbByk9Pw5_%}zxY>&*1IC&T9M-m2hI^n5&Z%WVS2mVx!$Y20 zUo;!I7gn#He!8WxHkw|x%-YDSdk7_THd8v({c3?obgBIbvJ=~+J5;W(wOrb$`EVX~ zAfzIqo<L#QLmwMni{1{22HDlEWuJs>S7g%>uYnh^lM%dTSrYtWqc;v5*o-R2S(htY z{od}_%VsEf&$;XK=7B>at9~mgQ$K&SGi<GA5~?&vnXuR@)QFieCD;;nn>kF=j~?VY z=eUqGk-{D5I|I$owIZ(|{9&D6*&)2lqQN;wEB9_R)gGI~esjuC%cQHGH#tc$*Wu6{ znS%_3u8t!N?qzzHs(Lquh<%gQ*~V~|lq6}BVW~<p(4%)sbLu&H7ley8O%3(VnSz2> 
zEubJrUWZcarr3EaeFe|G1c%)-nKF{8V5oJvw~JjFE7VJ!g8)qzW4Yek!K0z~Ou*Dm zn4gvfWHF6Qyk_sybWiB#7OhQkFj6v?TraG>n3>+g`gzV~H(16&2!2hLh;>S7O3q|7 zr2FAj3GT;Lvt%OqksoOL%-L1$^U7u&rv({Ni%y}af;eV9kUGZ)rKPw%8$<Gi8};x) z#djV@yrQFvw5|@D<C%Y;X$XV46DXU^yi*)37OcT<yzGKFR{br)1unC$_1uwj^Qtdu zY0kCn8ZL%30`k%pA86N*Ena&YW(I95ZDrT837wp2-5xY=zsw;8K9MNFB*}{WKvV6z zk_zgNyhiEcu#|>EVZ1FRbH^?{H=J6?4??m$zfHkAzG#+%GO8j<3Dz?YqC9;JwbwS| z;Y2AvUhRlZ*RV^Xx7gc*&Frb;ZZd1ND-t9&-C_NI>+z-h%dz(5dQSesmHt0GC@G@+ z!$Ao*7PIFPN_HpU=6M=#tiy0V96Xq5zy~Jw6m{&xC<VWp);o%GEPKXtRp~x*st!Z# z<GLbDJGJp>28CwAJWTZ3tZKe^&s0_B-brJaSV=R@IT)H~fGw6Awnjodd)fDeO47v! z*#1VOv=-Vwbx@QjZX;tnkFuA>%%ruD@h4Tna-&p!blmDiZ8(Pf=BNU3yY0NQELP6g zvUDP}8|}<DyXKCe7dovj?)8UMYawdWs&&nZp4?1M>MA^S<>)l1!uuW}R@kJ_=JWY& zA)j{+FSR(~EA@75e{+g3uc`MWgbeLg$PM{zV$ML2W;BUoyFJ+xcxPHT5c_PaUavhQ z12$ESKRhR6*{HRH@Uerf6N^gcIXY)Dl((22t3^W)N|;+pzp~i|Io?QjIw%DD_TGBg zou`p+sftpY+PN%Vm*D<Ys;w`DRTorMD$O?0r99}pEU$TNH#pE-<Kpr_-4p_`+zhnp zsFv6o7mYNld#z<b?`=g;FjyxyTa+R#TvyZEEU0E-b(L**y!Lnl)|9nxmbb#30u#t9 zu;p|<A1qTddy<pU6@0Y!PiVFl3#HK|&(+maz1QayNQApZ)V*uA5}500#W-Sz0VrCg zvrcHasJ^V5tEhGY6@JH$wd!~v7e?Y~NJo<C-;-JC+0EaJ(AYMyZSoL0^PT0^yU*KQ zh+5U@U6ah4MYwrbNzIGm?K*H%i}`GasS|PBg=&;hp78n&cL>^(AJQbh%q%kV-CSq7 za;>GoTarW<ejD2_M$q-qeT+ElqKweR3~5boEp7_07J4Qbs<-KSl*-i*xp8utadAJ{ zM)Gb<HjGrM<SQ=D*DHR@4$gahV(ZwdHcb<gZdH0{%#n%KlNTmg0jy+JRM42gRpH67 z&1Yy~l}duEsEoz7svAbW3$tWz-b;I}DmuKO(>bSCcBd#8mE1vLQi>XqxR*|@Q9k$Q zw!$=ZHLO(#h}(eMi(kEQiB^7o7}Y1iCaPonaC^9%w0=rkYxKM{x$b-$Pttf_&Zxqb z+g5kB=YvZ(971gVf~=+KEu0Oy2vJzHRy^3UBqo@SI|UJ%1^Frst`_xCpGD6&E}u`G z_%>z_A--b3ZgO+->)8U)wf3!ocl4W2a?q#M1|C-S3a6>%qM>$cnzRj4&cRx}=&KVA z_O)4B>Z7m3fRZuewt;IxB^4ifIK3;bGF<YNarlCKz`N~^v@BzcA`!a(;t#JnhvM)A zB^4=pgcEb8cjFsC2j1N`{B0r>_z0WvXqFht)3vU9<Gj@y8QQZjG&UEl6JLS2#<=-U zQsU2{bNnCxlKLQidw>#67ii!J{6;Is-G=uAc)!s|D((pUK%)wAg8hM3DE@wJu@pB- ze&8$7ah3A}jiKYR{s$UM#--#x(nyN<onDeE{Z3wiVB=cr>)I%^F0K=Pws&L^_*hm8 zR*@=_d)8r{qf;grhGp%;{!lJjWvXn`RQ-=R)*sM!9jDVj?>M0-5F^xi#?|A$YCgdq 
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "reportlab>=4.0.0",
# ]
# ///

"""
Script to generate a sample PDF document for testing the PDF extraction agent.
"""

import os
import sys

try:
    from reportlab.lib.pagesizes import letter
    from reportlab.lib import colors
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
except ImportError:
    print("Required packages not found. Please run this script with uv:")
    print("uv run examples/extract_doc/sample_document.py")
    sys.exit(1)


def create_sample_pdf(output_path):
    """
    Create a sample PDF document with structured content for testing extraction.

    The document mimics a short research paper: title, authors, publication
    date, numbered sections, one data table, and a reference list.
    """
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    styles = getSampleStyleSheet()
    story = []

    def emit(flowable, gap=0):
        # Append a flowable, optionally followed by a vertical spacer.
        story.append(flowable)
        if gap:
            story.append(Spacer(1, gap))

    # Front matter: title, authors, publication date.
    emit(Paragraph("Research on Machine Learning Applications in Healthcare", styles["Title"]), 12)
    emit(Paragraph("Authors: Jane Smith, John Doe, Alice Johnson", styles["Heading3"]), 12)
    emit(Paragraph("Publication Date: March 15, 2025", styles["Heading3"]), 24)

    # Sections that precede the table, as (heading, body) pairs.
    leading_sections = [
        ("Abstract",
         "This paper explores the applications of machine learning in healthcare, "
         "focusing on diagnostic tools, treatment optimization, and patient monitoring systems. "
         "We review recent advancements and discuss challenges and opportunities in this rapidly evolving field."),
        ("1. Introduction",
         "Machine learning has transformed healthcare in recent years, enabling more accurate "
         "diagnoses, personalized treatment plans, and efficient resource allocation. "
         "This paper provides an overview of current applications and future directions."),
        ("2. Methods",
         "We conducted a systematic review of literature published between 2020 and 2025, "
         "focusing on peer-reviewed articles describing machine learning applications in clinical settings. "
         "Our analysis included both supervised and unsupervised learning approaches."),
        ("3. Results",
         "Our analysis identified three primary areas where machine learning has made significant impacts: "
         "diagnostic assistance, treatment optimization, and patient monitoring. Each area shows promising "
         "results but faces unique implementation challenges."),
    ]
    for heading, body in leading_sections:
        emit(Paragraph(heading, styles["Heading1"]))
        emit(Paragraph(body, styles["Normal"]), 12)

    # Table 1: machine learning applications.
    emit(Paragraph("Table 1: Machine Learning Applications in Healthcare", styles["Heading3"]))
    table_rows = [
        ['Application Area', 'ML Techniques', 'Accuracy Range', 'Implementation Status'],
        ['Diagnostic Imaging', 'CNNs, Transfer Learning', '85-95%', 'Clinical Use'],
        ['Treatment Planning', 'Reinforcement Learning, GBMs', '75-88%', 'Clinical Trials'],
        ['Patient Monitoring', 'RNNs, LSTMs', '82-91%', 'Early Adoption'],
        ['Drug Discovery', 'GANs, Autoencoders', '70-85%', 'Research Phase'],
    ]
    applications_table = Table(table_rows, colWidths=[120, 120, 100, 120])
    applications_table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.black),
    ]))
    emit(applications_table, 12)

    # Sections that follow the table.
    closing_sections = [
        ("4. Discussion",
         "While machine learning shows great promise in healthcare, several challenges remain. "
         "These include data privacy concerns, model interpretability, regulatory approval processes, "
         "and integration with existing clinical workflows. Future research should address these challenges "
         "while expanding applications to underserved areas of medicine."),
        ("5. Conclusion",
         "Machine learning continues to revolutionize healthcare by improving diagnostic accuracy, "
         "treatment efficacy, and patient outcomes. As technology advances and more data becomes "
         "available, we expect to see broader adoption and more sophisticated applications in clinical practice."),
    ]
    for heading, body in closing_sections:
        emit(Paragraph(heading, styles["Heading1"]))
        emit(Paragraph(body, styles["Normal"]), 12)

    # Reference list, one paragraph per citation.
    emit(Paragraph("References", styles["Heading1"]))
    citations = [
        "Smith, J. et al. (2023). Deep Learning for Medical Image Analysis. Journal of AI in Medicine, 45(2), 112-128.",
        "Doe, J. & Johnson, A. (2024). Reinforcement Learning for Treatment Optimization. Healthcare Informatics Review, 18(3), 89-103.",
        "Chen, X. et al. (2022). Patient Monitoring Systems Using Recurrent Neural Networks. IEEE Transactions on Medical Systems, 41(4), 215-230.",
        "Williams, R. & Brown, T. (2025). Ethical Considerations in Healthcare AI. Bioethics Today, 12(1), 45-62.",
        "Garcia, M. et al. (2021). Generative Models for Drug Discovery. Nature Machine Intelligence, 3(5), 375-390.",
    ]
    for citation in citations:
        emit(Paragraph(citation, styles["Normal"]), 6)

    # Render the assembled story to disk.
    doc.build(story)
    print(f"Sample PDF created at: {output_path}")


if __name__ == "__main__":
    script_dir = os.path.dirname(os.path.abspath(__file__))
    create_sample_pdf(os.path.join(script_dir, "sample_document.pdf"))