Skip to content

Latest commit

ย 

History

History
320 lines (223 loc) ยท 12.7 KB

File metadata and controls

320 lines (223 loc) ยท 12.7 KB

PDF์—์„œ ํ…์ŠคํŠธ, ์ด๋ฏธ์ง€, ํ…Œ์ด๋ธ” ์ •๋ณด๋ฅผ ์ถ”์ถœํ•˜๊ธฐ

pdf-extraction.ipynb์—์„œ๋Š” PDF์—์„œ ํ…์ŠคํŠธ, ์ด๋ฏธ์ง€, ํ‘œ๋ฅผ ์ถ”์ถœํ•˜๋Š” ๋ฐฉ๋ฒ•์„ ์„ค๋ช…ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.

S3๋กœ๋ถ€ํ„ฐ PDF Loadingํ•˜๊ธฐ

pypdf๋ฅผ ์ด์šฉํ•˜์—ฌ S3์— ์žˆ๋Š” pdf ํŒŒ์ผ์„ ๋กœ๋“œํ•ฉ๋‹ˆ๋‹ค.

import boto3
from pypdf import PdfReader      
from io import BytesIO

# Download the PDF object from S3 and keep its raw bytes; the same bytes
# are reused later when the document is opened a second time with fitz.
s3r = boto3.resource("s3")
doc = s3r.Object(s3_bucket, key)

s3_response = doc.get()
Byte_contents = s3_response['Body'].read()

# Wrap the bytes in an in-memory stream and hand it to pypdf.
pdf_stream = BytesIO(Byte_contents)
reader = PdfReader(pdf_stream)

텍스트 추출

ํŽ˜์ด์ง€ ๋‹จ์œ„๋กœ pdf์˜ text๋ฅผ extract_text()๋กœ ์ฝ์€ ํ›„์— contents๋ฅผ ์ƒ์„ฑํ•˜์—ฌ ํ™œ์šฉํ•ฉ๋‹ˆ๋‹ค.

# Extract the text of every page, in page order.
texts = [page.extract_text() for page in reader.pages]

# Join once, after all pages are collected. Joining inside the loop rebuilt
# the whole string on every page and left `contents` undefined for a PDF
# with no pages.
contents = '\n'.join(texts)

이미지 추출

개별 이미지 추출하기

pdf ํŒŒ์ผ์— ํฌํ•จ๋œ ๋ชจ๋“  ์ด๋ฏธ์ง€๋“ค์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.

# `files` accumulates everything this walkthrough uploads to S3.
files = []

# Upload the embedded images and record what the helper returned.
image_files = extract_images_from_pdf(reader, key)
files.extend(image_files)

def extract_images_from_pdf(reader, key):
    """Upload the embedded images of a PDF to S3 and return their keys.

    Every image object on every page of ``reader`` is inspected. An image
    whose name was already seen is skipped, and an image whose file
    extension is not a known image type is ignored. All others are
    uploaded under ``<s3_prefix>/files/<object name>/<image name>``.

    Reads the module-level ``s3_prefix``, ``s3_bucket`` and ``s3_client``.

    Args:
        reader: pypdf reader exposing ``pages``, each with an ``images``
            collection whose items provide ``name`` and ``data``.
        key: S3 key of the source PDF; the part after ``s3_prefix`` names
            the destination folder.

    Returns:
        list[str]: S3 keys of the uploaded images, in upload order. Only
        keys are returned; image names are tracked separately, so the
        result no longer mixes bare names with keys.
    """
    content_types = {
        'png': 'image/png',
        'jpg': 'image/jpeg',
        'jpeg': 'image/jpeg',
        'gif': 'image/gif',
        'bmp': 'image/bmp',
        'tiff': 'image/tiff',
        'tif': 'image/tiff',
        'svg': 'image/svg+xml',
        'webp': 'image/webp',
        'ico': 'image/x-icon',
        'eps': 'image/eps',
    }

    # get path from key (same for every image, so computed once)
    objectName = key[key.find(s3_prefix)+len(s3_prefix)+1:]
    folder = s3_prefix+'/files/'+objectName+'/'

    seen_names = set()          # image names already handled (dedup only)
    extracted_image_files = []  # S3 keys actually uploaded (return value)

    for page in reader.pages:
        for image_file_object in page.images:
            img_name = image_file_object.name
            if img_name in seen_names:
                # Same name maps to the same S3 key, so uploading again
                # would only overwrite the earlier object.
                print('skip....')
                continue
            seen_names.add(img_name)

            # Compare case-insensitively so names like "X0.PNG" are kept.
            ext = img_name.split('.')[-1].lower()
            contentType = content_types.get(ext)
            if not contentType:
                continue

            img_key = folder+img_name
            s3_client.put_object(
                Bucket=s3_bucket,
                Key=img_key,
                ContentType=contentType,
                Body=BytesIO(image_file_object.data),
            )
            extracted_image_files.append(img_key)

    return extracted_image_files

ํŽ˜์ด์ง€ ๋‹จ์œ„๋กœ ์ด๋ฏธ์ง€ ์ €์žฅํ•˜๊ธฐ

ํŽ˜์ด์ง€ ๋‹จ์œ„๋กœ ์ด๋ฏธ์ง€๋ฅผ ์ฒ˜๋ฆฌํ•˜๊ณ ์ž ํ• ๋•Œ ํ™œ์šฉํ•ฉ๋‹ˆ๋‹ค. ์ƒ์„ธํ•œ ๋‚ด์šฉ์€ ์•„๋ž˜๋ฅผ ์ฐธ์กฐํ•ฉ๋‹ˆ๋‹ค. fitz๋ฅผ ์œ„ํ•ด PyMuPDF๋ฅผ ์„ค์น˜ํ•˜์—ฌ์•ผ ํ•ฉ๋‹ˆ๋‹ค.

  • ์•„๋ž˜์—์„œ๋Š” ํ•œ ํŽ˜์ด์ง€์— ์ด๋ฏธ์ง€๊ฐ€ 4๊ฐœ ์ด์ƒ ์žˆ๋Š” ๊ฒฝ์šฐ์— page ๋‹จ์œ„๋กœ ์ด๋ฏธ์ง€๋ฅผ ์ €์žฅํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.
  • ๊ฐ„ํ˜น pdf์— ์ด๋ฏธ์ง€๊ฐ€ ์žˆ์Œ์—๋„ ์ด๋ฏธ์ง€ object ์ •๋ณด๊ฐ€ ์—†๋Š” ๊ฒฝ์šฐ์™€ ์ด๋ฏธ์ง€๊ฐ€ 1๊ฐœ ์ด์ƒ์ธ๋ฐ ๊ฐ€๋กœ ๋˜๋Š” ์„ธ๋กœ๊ฐ€ 100ํ”ฝ์…€ ์ด์ƒ์ธ ๊ฒฝ์šฐ์— ํŒŒ์ผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.
  • ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ์˜ ํŽธ์˜๋ฅผ ์œ„ํ•ด์„œ dpi๋Š” 200์œผ๋กœ ์„ค์ •ํ•˜์˜€์Šต๋‹ˆ๋‹ค.
  • S3์— ํŒŒ์ผ์˜ ๋‚ด์šฉ์„ ํ™•์ธ์‹œ pngํŒŒ์ผ์ด jpg๋ณด๋‹ค ํŽธ๋ฆฌํ•ฉ๋‹ˆ๋‹ค. ๋”ฐ๋ผ์„œ ํŒŒ์ผ ์ €์žฅ ํฌ๋งท์„ png๋กœ ํ•˜๊ณ  ์žˆ์Šต๋‹ˆ๋‹ค.
import fitz

# Re-open the same PDF bytes with PyMuPDF to render whole pages.
pages = fitz.open(stream=Byte_contents, filetype='pdf')

for i, page in enumerate(pages):
    # width/height follow the image boxes on this page: they are replaced
    # whenever a box is wider or taller than the current values, but only
    # by boxes narrower than 940 and shorter than 520.
    width = height = 0
    for j, info in enumerate(page.get_image_info()):
        bbox = info['bbox']
        print(f"page[{i}] -> bbox[{j}]: {bbox}")
        box_w = bbox[2]-bbox[0]
        box_h = bbox[3]-bbox[1]
        if box_w < 940 and box_h < 520 and (box_w > width or box_h > height):
            width, height = box_w, box_h

    # Capture the page when it has 4+ images, or at least one image and
    # either no box was recorded or the recorded box reaches 100 on a side.
    image_count = nImages[i]
    no_box_recorded = width == 0 and height == 0
    big_enough = width >= 100 or height >= 100
    if not (image_count >= 4 or (image_count >= 1 and (no_box_recorded or big_enough))):
        continue

    # save current pdf page to image
    pixmap = page.get_pixmap(dpi=200)

    # convert to png
    img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    pixels = BytesIO()
    img.save(pixels, format='PNG')
    pixels.seek(0, 0)

    # get path from key
    objectName = key[key.find(s3_prefix)+len(s3_prefix)+1:len(key)]
    folder = s3_prefix+'/captures/'+objectName+'/'

    fname = 'img_'+key.split('/')[-1].split('.')[0]+f"_{i}"
    capture_key = folder+fname+'.png'

    response = s3_client.put_object(
        Bucket=s3_bucket,
        Key=capture_key,
        ContentType='image/png',
        Metadata={
            "ext": 'png',
            "page": str(i),
        },
        Body=pixels,
    )

    files.append(capture_key)

ํ…Œ์ด๋ธ” ์ถ”์ถœ

PDF์—์„œ ํ…Œ์ด๋ธ” ์ถ”์ถœ์˜ ์–ด๋ ค์›€

prompt.pdf์™€ ๊ฐ™์ด PDF์•ˆ์— Header์™€ Footer๊ฐ€ ์žˆ์„ ์ˆ˜ ์žˆ๊ณ , ํ‘œ๊ณผ 2๊ฐœ์˜ ํŽ˜์ด์ง€์— ๊ฑธ์ณ์„œ ์žˆ๋‹ค๋ฉด, ํ•˜๋‚˜์˜ ํ‘œ๋ฅผ ๋งŒ๋“ค๊ธฐ ์œ„ํ•ด, Header/Footer์˜ ์ œ๊ฑฐ ๋ฐ 2๊ฐœ๋กœ ๋‚˜๋ˆ„์–ด์ง„ ํ‘œ๋ฅผ ํ•˜๋‚˜๋กœ ํ•ฉ์น˜๋Š” ๊ณผ์ •์ด ํ•„์š”ํ•ฉ๋‹ˆ๋‹ค. Header์™€ Footer์˜ ํฌ๊ธฐ๋Š” ๋ฌธ์„œ๋งˆ๋‹ค ๋‹ค๋ฅผ์ˆ˜ ์žˆ๊ณ  ๊ฐ™์€ ๋ฌธ์„œ๋ผ๋„ ํŽ˜์ด์ง€๋งˆ๋‹ค ๋‹ค๋ฅผ ์ˆ˜ ์žˆ์–ด์„œ ๋ฌธ์„œ๋งˆ๋‹ค customize๊ฐ€ ํ•„์š”ํ•˜๋ฉฐ ๋‚˜๋ˆ„์–ด์ง„ ํ‘œ๋ผ๋Š”๊ฒƒ์„ ์ธ์‹ํ•˜๋Š”๊ฒƒ๋„ ๋ฌธ์„œ๋งˆ๋‹ค ๋‹ค๋ฅผ ์ˆ˜ ์žˆ์–ด์„œ ํ‘œ์ค€ํ™”๋œ ๋ฌธ์„œ์— ๋Œ€ํ•ด์„œ๋งŒ ํ…Œ์ด๋ธ” ๊ฒฐํ•ฉ์ด ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค.

image

아래는 테이블 결합을 한 예입니다. 다만 이것은 다양한 포맷을 가지고 있는 문서들에는 적용이 어렵고, 표준화된 문서에 대해서만 적용 가능합니다.

Input (LLM + tool for SQL to DB) Output (with SQL tool) – Right answer
Calculate the price ratio for stock 'ABC' between 2023&#45; 01&#45;03 and 2023&#45;01&#45;04? > Entering new AgentExecutor chain... I will need historical stock price data for the two dates Action: Stock DB Action Input: Price of ABC stock on 2023&#45;01&#45;03 and 2023&#45;01&#45;04 > Entering new SQLDatabaseChain chain...Price of ABC stock on 2023&#45;01&#45;03 and 2023&#45;01&#45;04 SQLQuery:SELECT price FROM stocks WHERE stock_ticker = "ABC" AND date BETWEEN "2023&#45; 01&#45;03" AND "2023&#45;01&#45;04" SQLResult: [(232.0,), (225.0,)] Answer: The price of ABC stock on January 03, 2023 was 232.0 and on January 04, 2023 was 225.0. > Finished chain. Observation: The price of ABC stock on January 03, 2023 was 232.0 and on January 04, 2023 was 225.0. Thought: Now I can compute the price ratio Final Answer: The price ratio for stock 'ABC' between 2023&#45; 01&#45;03 and 2023&#45;01&#45;04 is 232.0/225.0 = 1.0311
--- ---

MarkDown 형식

fitz๋กœ ์ถ”์ถœํ•œ ํŽ˜์ด์ง€์—์„œ ์•„๋ž˜์™€ ๊ฐ™์ด find_tables()๋กœ ํ…Œ์ด๋ธ” ๊ฐ์ฒด๋ฅผ ์ฐพ์•„์„œ to_markdown()๋กœ markdown ํ˜•ํƒœ๋กœ ์ถ”์ถœํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

# Markdown of the first detected table on each page that has any table.
table_md = []
for page in pages:
    found = page.find_tables()
    if not found.tables:
        continue
    table_md.append(found[0].to_markdown())

์ถ”์ถœ๋œ markdown ํ˜•ํƒœ์˜ table์€ RAG์— ๋ฌธ์„œ๋กœ ๋“ฑ๋กํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

이미지 형식

ํ‘œ์— ๊ทธ๋ฆผ์ด ํฌํ•จ๋˜์–ด ์žˆ๊ฑฐ๋‚˜ ํ‘œ์˜ ์š”์•ฝ์„ RAG์— ๋“ฑ๋กํ•จ์œผ๋กœ์จ RAG ๊ฒ€์ƒ‰์˜ ์ •ํ™•๋„๋ฅผ ๋†’์ผ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ๋˜ํ•œ ์ถ”์ถœ๋œ ์ด๋ฏธ์ง€๋Š” ํ‘œ์— ๋Œ€ํ•œ ๋งํฌ๋ฅผ ์ƒ์„ฑํ•  ๋•Œ ํ™œ์šฉ๋ฉ๋‹ˆ๋‹ค.

tables = []
for i, page in enumerate(pages):
    page_tables = page.find_tables()

    # Nothing to report or crop on pages without a detected table.
    if not page_tables.tables:
        continue

    # Only the first table on the page is handled.
    tab = page_tables[0]

    print(tab.to_markdown())

    # Diagnostics describing the detected table.
    print(f"index: {i}")
    print(f"bounding box: {tab.bbox}")  # bounding box of the full table
    print(f"top-left cell: {tab.cells[0]}")  # top-left cell
    print(f"bottom-right cell: {tab.cells[-1]}")  # bottom-right cell
    print(f"row count: {tab.row_count}, column count: {tab.col_count}") # row and column counts
    print("\n\n")

    # Crop the table region from the rendered page and upload it.
    extract_table_image(page, i, tab.bbox)

이때, 이미지 추출을 위한 함수는 아래와 같습니다.

from PIL import Image

def extract_table_image(page, index, bbox):
    """Crop a table region out of a rendered page and upload it to S3 as PNG.

    The page is rendered twice: once with default settings and once at
    200 dpi. The size ratio between the two renders scales ``bbox`` into
    the pixel space of the 200 dpi image before cropping.

    Reads the module-level ``key``, ``s3_prefix``, ``s3_bucket`` and
    ``s3_client``, and appends the uploaded key to the module-level
    ``files`` list.

    Args:
        page: page object providing ``get_pixmap``.
        index: page index; used in the file name and stored as the
            ``page`` metadata value.
        bbox: four indexable coordinates (x0, y0, x1, y1) of the table.

    Returns:
        None.
    """
    # Reference render at the default resolution.
    pixmap_ori = page.get_pixmap()
    print(f"width: {pixmap_ori.width}, height: {pixmap_ori.height}")

    # Render at 200 dpi; this is the image that gets cropped.
    pixmap = page.get_pixmap(dpi=200)

    # convert to png
    size = [pixmap.width, pixmap.height]
    img = Image.frombytes("RGB", size, pixmap.samples)
    print(f"width: {pixmap.width}, height: {pixmap.height}")

    # Scale factors from the reference render to the 200 dpi render.
    rate_width = pixmap.width / pixmap_ori.width
    rate_height = pixmap.height / pixmap_ori.height
    print(f"rate_width={rate_width}, rate_height={rate_height}")

    crop_box = (
        bbox[0]*rate_width,
        bbox[1]*rate_height,
        bbox[2]*rate_width,
        bbox[3]*rate_height,
    )
    crop_img = img.crop(crop_box)

    pixels = BytesIO()
    crop_img.save(pixels, format='PNG')
    pixels.seek(0, 0)

    # get path from key
    objectName = key[key.find(s3_prefix)+len(s3_prefix)+1:len(key)]
    folder = s3_prefix+'/captures/'+objectName+'/'

    fname = 'table_'+key.split('/')[-1].split('.')[0]+f"_{index}"
    table_key = folder+fname+'.png'

    response = s3_client.put_object(
        Bucket=s3_bucket,
        Key=table_key,
        ContentType='image/png',
        Metadata={
            "ext": 'png',
            "page": str(index),
        },
        Body=pixels,
    )

    files.append(table_key)

MarkDown Output

How to use Markdown output์™€ ๊ฐ™์ด pdf๋ฅผ markdown output์œผ๋กœ ์ €์žฅํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.

import pymupdf4llm
from langchain.text_splitter import MarkdownTextSplitter

# Convert the whole PDF (all pages) to a single markdown string.
md_text = pymupdf4llm.to_markdown("input.pdf")

# Split the markdown into chunks of at most 40 characters, no overlap.
splitter = MarkdownTextSplitter(
    chunk_size=40,
    chunk_overlap=0,
)

# The resulting documents are not stored; in a notebook this last
# expression is what gets displayed.
splitter.create_documents([md_text])

Reference

Table Recognition and Extraction With PyMuPDF

How can I extract semi structured tables from PDF using pdfplumber

How do I extract a table from a pdf file using pymupdf

How to save a pandas DataFrame table as a png

Preprocessing for complex PDF

PyMuPDF4LLM์„ ํ™œ์šฉํ•œ PDF ํŒŒ์‹ฑ ๋ฐ FAISS ๋ฒกํ„ฐ ์Šคํ† ์–ด๋ฅผ ์‚ฌ์šฉํ•œ RAG

Generate Synthetic QnAs from Real-world Data

Generate QnA synthetic dataset from a Complex PDF

Introducing PyMuPDF4LLM

Welcome to PyMuPDF

find_table()