#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "openai>=1.12.0",
#     "openai-agents",
# ]
# ///

"""
Example of creating an Agent that extracts information from a PDF document
using the input_file content option of the OpenAI responses API.
"""

import asyncio
import base64
import json
import os
import sys
from typing import Any, Dict

try:
    from agents import Agent, Runner, set_default_openai_api
except ImportError:
    print("Required packages not found. Please run this script with uv:")
    print("uv run examples/extract_doc/pdf_extraction_agent.py")
    sys.exit(1)


async def extract_data_from_pdf(agent: Agent, pdf_path: str) -> Dict[str, Any]:
    """
    Extract structured data from a PDF document using the OpenAI responses API.

    The PDF is read from disk, base64-encoded, and sent to the model as an
    ``input_file`` content part alongside a prompt describing the desired
    JSON schema.

    Args:
        agent: The agent to use for extraction
        pdf_path: Path to the PDF file

    Returns:
        Extracted structured data from the PDF, or ``{"error": ...}`` when
        the model's response could not be parsed as JSON.
    """
    # Read the PDF file and encode it as base64
    with open(pdf_path, "rb") as f:
        pdf_data = f.read()

    pdf_base64 = base64.b64encode(pdf_data).decode("utf-8")
    pdf_name = os.path.basename(pdf_path)

    # Define the extraction schema - modify this based on what you want to extract
    extraction_schema = {
        "title": "string",
        "authors": ["string"],
        "publication_date": "string",
        "abstract": "string",
        "sections": [
            {
                "heading": "string",
                "content": "string"
            }
        ],
        "tables": [
            {
                "caption": "string",
                "data": [["string"]]
            }
        ],
        "figures": [
            {
                "caption": "string",
                "description": "string"
            }
        ],
        "references": ["string"]
    }

    # Create the input with the PDF file
    input_with_pdf = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_text",
                    "text": (
                        "Extract the following information from the PDF document in a structured format:\n"
                        f"{json.dumps(extraction_schema, indent=2)}\n\n"
                        "Return the extracted data as a JSON object that follows this schema exactly."
                    )
                },
                {
                    "type": "input_file",
                    "filename": pdf_name,
                    "file_data": f"data:application/pdf;base64,{pdf_base64}"
                }
            ]
        }
    ]

    # Run the agent with the PDF input
    result = await Runner.run(agent, input=input_with_pdf)

    # Extract the JSON response
    response_text = result.final_output

    # Parse the JSON from the response text
    # This handles cases where the model might include markdown code blocks
    json_str = extract_json_from_text(response_text)

    try:
        extracted_data = json.loads(json_str)
        return extracted_data
    except json.JSONDecodeError:
        print("Failed to parse JSON response. Raw response:")
        print(response_text)
        return {"error": "Failed to parse response"}


def extract_json_from_text(text: str) -> str:
    """
    Extract JSON string from text that might contain markdown or other formatting.

    Tries, in order: a ```json fenced block, a plain ``` fenced block, then
    the outermost {...} span.  Falls back to returning *text* unchanged.
    """
    # Check if the text contains a fenced code block.  Only trust a fence if
    # it is actually closed; an unclosed fence previously caused the last
    # character of the text to be silently dropped (find() returning -1 used
    # as a slice end), so we fall through to brace scanning instead.
    if "```json" in text:
        # Extract content between ```json and ```
        start = text.find("```json") + 7
        end = text.find("```", start)
        if end != -1:
            return text[start:end].strip()
    elif "```" in text:
        # Extract content between ``` and ```
        start = text.find("```") + 3
        end = text.find("```", start)
        if end != -1:
            return text[start:end].strip()

    # If no (closed) code block, try to find a JSON object directly:
    # look for the first { and the last }
    start = text.find("{")
    end = text.rfind("}") + 1

    if start >= 0 and end > start:
        return text[start:end].strip()

    # If all else fails, return the original text
    return text
# Add a verification function to check if the extraction was successful
async def verify_extraction(agent: Agent, pdf_path: str, extracted_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Verify if the extracted data is grounded in the PDF content.

    Sends the original PDF and the previously extracted data back to the
    model and asks it to flag any items not supported by the document.

    Args:
        agent: The agent to use for verification
        pdf_path: Path to the PDF file
        extracted_data: The extracted data to verify

    Returns:
        Verification results as a dict, or ``{"error": ...}`` when the
        model's response could not be parsed as JSON.
    """
    # Read the PDF file and encode it as base64
    with open(pdf_path, "rb") as f:
        pdf_data = f.read()

    pdf_base64 = base64.b64encode(pdf_data).decode("utf-8")
    pdf_name = os.path.basename(pdf_path)

    # Create the input with the PDF file and extracted data
    input_with_pdf = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_text",
                    "text": (
                        "Check if the following extracted data is grounded in the PDF content:\n\n"
                        f"Extracted data:\n{json.dumps(extracted_data, indent=2)}\n\n"
                        "Return a JSON object with the following structure:\n"
                        "{ \"is_grounded\": boolean, \"ungrounded_items\": [{ \"path\": \"path.to.item\", \"value\": \"extracted value\", \"issue\": \"description of issue\" }] }"
                    )
                },
                {
                    "type": "input_file",
                    "filename": pdf_name,
                    "file_data": f"data:application/pdf;base64,{pdf_base64}"
                }
            ]
        }
    ]

    # Run the agent with the PDF input
    result = await Runner.run(agent, input=input_with_pdf)

    # Extract the JSON response
    response_text = result.final_output
    json_str = extract_json_from_text(response_text)

    try:
        verification_result = json.loads(json_str)
        return verification_result
    except json.JSONDecodeError:
        print("Failed to parse verification JSON. Raw response:")
        print(response_text)
        return {"error": "Failed to parse verification response"}


# Example usage with verification
async def extract_and_verify() -> tuple:
    """
    Extract data from the sample PDF and then verify the extraction.

    Returns:
        A ``(extracted_data, verification)`` tuple; both elements are
        ``None`` when the sample PDF is missing, and ``verification`` is
        ``None`` when extraction itself failed.
    """
    # Set up the agent
    set_default_openai_api("responses")
    openai_api_key = os.environ.get("OPENAI_API_KEY")
    if not openai_api_key:
        raise ValueError("Please set the OPENAI_API_KEY environment variable")

    # Use the sample document created by the other script
    current_dir = os.path.dirname(os.path.abspath(__file__))
    pdf_path = os.path.join(current_dir, "sample_document.pdf")

    if not os.path.exists(pdf_path):
        print(f"Sample PDF not found at {pdf_path}")
        print("Please run the sample_document.py script first:")
        print("uv run examples/extract_doc/sample_document.py")
        return None, None

    pdf_agent = Agent(
        name="PDF Processing Agent",
        instructions="An agent that extracts and verifies information from PDF documents.",
        model="gpt-4o",
    )

    # Extract data
    print("Extracting data from PDF...")
    extracted_data = await extract_data_from_pdf(pdf_agent, pdf_path)
    print("Extracted data:")
    print(json.dumps(extracted_data, indent=2))

    # Don't burn a second model round-trip verifying a response that never
    # parsed; extract_data_from_pdf signals failure with an "error" key.
    if "error" in extracted_data:
        print("\nSkipping verification because extraction failed.")
        return extracted_data, None

    # Verify extraction
    print("\nVerifying extraction...")
    verification = await verify_extraction(pdf_agent, pdf_path, extracted_data)
    print("Verification results:")
    print(json.dumps(verification, indent=2))

    return extracted_data, verification


if __name__ == "__main__":
    asyncio.run(extract_and_verify())
#!/usr/bin/env python3
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "reportlab>=4.0.0",
# ]
# ///

"""
Script to generate a sample PDF document for testing the PDF extraction agent.
"""

import os
import sys

try:
    from reportlab.lib.pagesizes import letter
    from reportlab.lib import colors
    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
except ImportError:
    print("Required packages not found. Please run this script with uv:")
    print("uv run examples/extract_doc/sample_document.py")
    sys.exit(1)


def create_sample_pdf(output_path: str) -> None:
    """
    Create a sample PDF document with structured content for testing extraction.

    The generated document contains a title, authors, publication date,
    abstract, numbered sections, one table, and a references list — one
    instance of each field the extraction agent's schema asks for.

    Args:
        output_path: Filesystem path where the PDF will be written.
    """
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    styles = getSampleStyleSheet()

    # Create custom styles
    title_style = styles["Title"]
    heading_style = styles["Heading1"]
    normal_style = styles["Normal"]

    # Create the content
    content = []

    # Title
    content.append(Paragraph("Research on Machine Learning Applications in Healthcare", title_style))
    content.append(Spacer(1, 12))

    # Authors
    content.append(Paragraph("Authors: Jane Smith, John Doe, Alice Johnson", styles["Heading3"]))
    content.append(Spacer(1, 12))

    # Publication Date
    content.append(Paragraph("Publication Date: March 15, 2025", styles["Heading3"]))
    content.append(Spacer(1, 24))

    # Abstract
    content.append(Paragraph("Abstract", heading_style))
    content.append(Paragraph(
        "This paper explores the applications of machine learning in healthcare, "
        "focusing on diagnostic tools, treatment optimization, and patient monitoring systems. "
        "We review recent advancements and discuss challenges and opportunities in this rapidly evolving field.",
        normal_style
    ))
    content.append(Spacer(1, 12))

    # Introduction
    content.append(Paragraph("1. Introduction", heading_style))
    content.append(Paragraph(
        "Machine learning has transformed healthcare in recent years, enabling more accurate "
        "diagnoses, personalized treatment plans, and efficient resource allocation. "
        "This paper provides an overview of current applications and future directions.",
        normal_style
    ))
    content.append(Spacer(1, 12))

    # Methods
    content.append(Paragraph("2. Methods", heading_style))
    content.append(Paragraph(
        "We conducted a systematic review of literature published between 2020 and 2025, "
        "focusing on peer-reviewed articles describing machine learning applications in clinical settings. "
        "Our analysis included both supervised and unsupervised learning approaches.",
        normal_style
    ))
    content.append(Spacer(1, 12))

    # Results
    content.append(Paragraph("3. Results", heading_style))
    content.append(Paragraph(
        "Our analysis identified three primary areas where machine learning has made significant impacts: "
        "diagnostic assistance, treatment optimization, and patient monitoring. Each area shows promising "
        "results but faces unique implementation challenges.",
        normal_style
    ))
    content.append(Spacer(1, 12))

    # Table: ML Applications
    content.append(Paragraph("Table 1: Machine Learning Applications in Healthcare", styles["Heading3"]))

    table_data = [
        ['Application Area', 'ML Techniques', 'Accuracy Range', 'Implementation Status'],
        ['Diagnostic Imaging', 'CNNs, Transfer Learning', '85-95%', 'Clinical Use'],
        ['Treatment Planning', 'Reinforcement Learning, GBMs', '75-88%', 'Clinical Trials'],
        ['Patient Monitoring', 'RNNs, LSTMs', '82-91%', 'Early Adoption'],
        ['Drug Discovery', 'GANs, Autoencoders', '70-85%', 'Research Phase']
    ]

    table = Table(table_data, colWidths=[120, 120, 100, 120])
    table.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
        ('GRID', (0, 0), (-1, -1), 1, colors.black)
    ]))

    content.append(table)
    content.append(Spacer(1, 12))

    # Discussion
    content.append(Paragraph("4. Discussion", heading_style))
    content.append(Paragraph(
        "While machine learning shows great promise in healthcare, several challenges remain. "
        "These include data privacy concerns, model interpretability, regulatory approval processes, "
        "and integration with existing clinical workflows. Future research should address these challenges "
        "while expanding applications to underserved areas of medicine.",
        normal_style
    ))
    content.append(Spacer(1, 12))

    # Conclusion
    content.append(Paragraph("5. Conclusion", heading_style))
    content.append(Paragraph(
        "Machine learning continues to revolutionize healthcare by improving diagnostic accuracy, "
        "treatment efficacy, and patient outcomes. As technology advances and more data becomes "
        "available, we expect to see broader adoption and more sophisticated applications in clinical practice.",
        normal_style
    ))
    content.append(Spacer(1, 12))

    # References
    content.append(Paragraph("References", heading_style))
    references = [
        "Smith, J. et al. (2023). Deep Learning for Medical Image Analysis. Journal of AI in Medicine, 45(2), 112-128.",
        "Doe, J. & Johnson, A. (2024). Reinforcement Learning for Treatment Optimization. Healthcare Informatics Review, 18(3), 89-103.",
        "Chen, X. et al. (2022). Patient Monitoring Systems Using Recurrent Neural Networks. IEEE Transactions on Medical Systems, 41(4), 215-230.",
        "Williams, R. & Brown, T. (2025). Ethical Considerations in Healthcare AI. Bioethics Today, 12(1), 45-62.",
        "Garcia, M. et al. (2021). Generative Models for Drug Discovery. Nature Machine Intelligence, 3(5), 375-390."
    ]

    for ref in references:
        content.append(Paragraph(ref, normal_style))
        content.append(Spacer(1, 6))

    # Build the PDF
    doc.build(content)
    print(f"Sample PDF created at: {output_path}")


if __name__ == "__main__":
    # Write the sample PDF next to this script, creating the directory if
    # needed (the old comment claimed this but never did it).
    output_dir = os.path.dirname(os.path.abspath(__file__))
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "sample_document.pdf")

    create_sample_pdf(output_path)