pdf-extraction.ipynb์์๋ PDF์์ ํ ์คํธ, ์ด๋ฏธ์ง, ํ๋ฅผ ์ถ์ถํ๋ ๋ฐฉ๋ฒ์ ์ค๋ช ํ๊ณ ์์ต๋๋ค.
pypdf๋ฅผ ์ด์ฉํ์ฌ S3์ ์๋ pdf ํ์ผ์ ๋ก๋ํฉ๋๋ค.
import boto3
from pypdf import PdfReader
from io import BytesIO
# Fetch the PDF object from S3 (s3_bucket and key are defined outside this snippet).
s3r = boto3.resource("s3")
doc = s3r.Object(s3_bucket, key)

# Read the whole object body into memory; the raw bytes are kept in
# Byte_contents so they can be handed to other parsers as well.
s3_response = doc.get()
Byte_contents = s3_response['Body'].read()

# pypdf needs a file-like object, so wrap the bytes in a BytesIO stream.
pdf_stream = BytesIO(Byte_contents)
reader = PdfReader(pdf_stream)
ํ์ด์ง ๋จ์๋ก pdf์ text๋ฅผ extract_text()๋ก ์ฝ์ ํ์ contents๋ฅผ ์์ฑํ์ฌ ํ์ฉํฉ๋๋ค.
# Extract the text of each page in order, then join the pages with newlines.
texts = [page.extract_text() for page in reader.pages]
contents = '\n'.join(texts)
pdf 파일에 포함된 모든 이미지들을 추출합니다.
# Start the list of generated files with everything returned for the
# images embedded in the PDF.
files = []
image_files = extract_images_from_pdf(reader, key)
files.extend(image_files)
# Maps a lower-cased file extension to the Content-Type sent to S3.
_IMAGE_CONTENT_TYPES = {
    'png': 'image/png',
    'jpg': 'image/jpeg',
    'jpeg': 'image/jpeg',
    'gif': 'image/gif',
    'bmp': 'image/bmp',
    'tiff': 'image/tiff',
    'tif': 'image/tiff',
    'svg': 'image/svg+xml',
    'webp': 'image/webp',
    'ico': 'image/x-icon',
    'eps': 'image/eps',
}


def extract_images_from_pdf(reader, key):
    """Upload the images embedded in a PDF to S3 and return their keys.

    Every page of ``reader`` is scanned via ``page.images``. An image whose
    name was already seen on an earlier page (or earlier on the same page) is
    skipped, and images whose extension is not a known image type are ignored.

    Relies on the module-level ``s3_prefix``, ``s3_bucket`` and ``s3_client``.

    Args:
        reader: A pypdf ``PdfReader`` (anything exposing ``pages`` whose
            items expose ``images`` with ``name`` and ``data``).
        key: S3 key of the source PDF; the part after ``s3_prefix`` is used
            as the sub-folder name for the extracted images.

    Returns:
        A list with the S3 key of every uploaded image, in upload order.
    """
    # The destination folder depends only on `key`, so build it once.
    objectName = key[key.find(s3_prefix)+len(s3_prefix)+1:]
    folder = s3_prefix+'/files/'+objectName+'/'

    # Seen names are tracked separately from the result so that the returned
    # list contains S3 keys only (not the raw image names).
    seen_names = set()
    extracted_image_files = []

    for page in reader.pages:
        for image_file_object in page.images:
            img_name = image_file_object.name
            if img_name in seen_names:
                print('skip....')
                continue
            seen_names.add(img_name)

            # Case-insensitive so that e.g. "X1.PNG" is handled like "X1.png".
            ext = img_name.split('.')[-1].lower()
            contentType = _IMAGE_CONTENT_TYPES.get(ext)
            if not contentType:
                continue

            img_key = folder+img_name
            s3_client.put_object(
                Bucket=s3_bucket,
                Key=img_key,
                ContentType=contentType,
                Body=BytesIO(image_file_object.data),
            )
            extracted_image_files.append(img_key)

    return extracted_image_files
ํ์ด์ง ๋จ์๋ก ์ด๋ฏธ์ง๋ฅผ ์ฒ๋ฆฌํ๊ณ ์ ํ ๋ ํ์ฉํฉ๋๋ค. ์์ธํ ๋ด์ฉ์ ์๋๋ฅผ ์ฐธ์กฐํฉ๋๋ค. fitz๋ฅผ ์ํด PyMuPDF๋ฅผ ์ค์นํ์ฌ์ผ ํฉ๋๋ค.
- ์๋์์๋ ํ ํ์ด์ง์ ์ด๋ฏธ์ง๊ฐ 4๊ฐ ์ด์ ์๋ ๊ฒฝ์ฐ์ page ๋จ์๋ก ์ด๋ฏธ์ง๋ฅผ ์ ์ฅํ๊ณ ์์ต๋๋ค.
- ๊ฐํน pdf์ ์ด๋ฏธ์ง๊ฐ ์์์๋ ์ด๋ฏธ์ง object ์ ๋ณด๊ฐ ์๋ ๊ฒฝ์ฐ์ ์ด๋ฏธ์ง๊ฐ 1๊ฐ ์ด์์ธ๋ฐ ๊ฐ๋ก ๋๋ ์ธ๋ก๊ฐ 100ํฝ์ ์ด์์ธ ๊ฒฝ์ฐ์ ํ์ผ๋ก ์ ์ฅํฉ๋๋ค.
- ์ด๋ฏธ์ง ์ฒ๋ฆฌ์ ํธ์๋ฅผ ์ํด์ dpi๋ 200์ผ๋ก ์ค์ ํ์์ต๋๋ค.
- S3์ ํ์ผ์ ๋ด์ฉ์ ํ์ธ์ pngํ์ผ์ด jpg๋ณด๋ค ํธ๋ฆฌํฉ๋๋ค. ๋ฐ๋ผ์ ํ์ผ ์ ์ฅ ํฌ๋งท์ png๋ก ํ๊ณ ์์ต๋๋ค.
import fitz
# NOTE: this snippet needs `from PIL import Image` (the import is shown later
# in this document). `nImages` is defined outside this snippet; it is indexed
# by page number and presumably holds the image count per page - confirm.
pages = fitz.open(stream=Byte_contents, filetype='pdf')
for i, page in enumerate(pages):
    imgInfo = page.get_image_info()

    # Track the size of the biggest image box on the page, considering only
    # boxes narrower than 940 and shorter than 520.
    width = height = 0
    for j, info in enumerate(imgInfo):
        bbox = info['bbox']
        print(f"page[{i}] -> bbox[{j}]: {bbox}")

        box_width = bbox[2]-bbox[0]
        box_height = bbox[3]-bbox[1]
        within_limits = box_width<940 and box_height<520
        if within_limits and (box_width>width or box_height>height):
            width = box_width
            height = box_height

    # Capture the page when it has 4+ images, or when it has at least one
    # image that either has no usable box info or is 100+ wide or tall.
    image_count = nImages[i]
    no_box_info = width==0 and height==0
    big_enough = width>=100 or height>=100
    if not (image_count>=4 or (image_count>=1 and (no_box_info or big_enough))):
        continue

    # Render the current pdf page and re-encode it as PNG.
    pixmap = page.get_pixmap(dpi=200)
    img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    pixels = BytesIO()
    img.save(pixels, format='PNG')
    pixels.seek(0, 0)

    # Derive the destination path from the key of the source PDF.
    objectName = key[key.find(s3_prefix)+len(s3_prefix)+1:]
    folder = s3_prefix+'/captures/'+objectName+'/'
    fname = 'img_'+key.split('/')[-1].split('.')[0]+f"_{i}"
    capture_key = folder+fname+'.png'

    response = s3_client.put_object(
        Bucket=s3_bucket,
        Key=capture_key,
        ContentType='image/png',
        Metadata={"ext": 'png', "page": str(i)},
        Body=pixels,
    )
    files.append(capture_key)
prompt.pdf์ ๊ฐ์ด PDF์์ Header์ Footer๊ฐ ์์ ์ ์๊ณ , ํ๊ณผ 2๊ฐ์ ํ์ด์ง์ ๊ฑธ์ณ์ ์๋ค๋ฉด, ํ๋์ ํ๋ฅผ ๋ง๋ค๊ธฐ ์ํด, Header/Footer์ ์ ๊ฑฐ ๋ฐ 2๊ฐ๋ก ๋๋์ด์ง ํ๋ฅผ ํ๋๋ก ํฉ์น๋ ๊ณผ์ ์ด ํ์ํฉ๋๋ค. Header์ Footer์ ํฌ๊ธฐ๋ ๋ฌธ์๋ง๋ค ๋ค๋ฅผ์ ์๊ณ ๊ฐ์ ๋ฌธ์๋ผ๋ ํ์ด์ง๋ง๋ค ๋ค๋ฅผ ์ ์์ด์ ๋ฌธ์๋ง๋ค customize๊ฐ ํ์ํ๋ฉฐ ๋๋์ด์ง ํ๋ผ๋๊ฒ์ ์ธ์ํ๋๊ฒ๋ ๋ฌธ์๋ง๋ค ๋ค๋ฅผ ์ ์์ด์ ํ์คํ๋ ๋ฌธ์์ ๋ํด์๋ง ํ ์ด๋ธ ๊ฒฐํฉ์ด ๊ฐ๋ฅํฉ๋๋ค.
์๋๋ ํ ์ด๋ธ ๊ฒฐํฉ์ ํ ์์ ๋๋ค. ๋ค๋ง ์ด๊ฒ์ ๋ค์ํ ํฌ๋งท์ ๊ฐ์ง๊ณ ์๋ ๋ฌธ์๋ค์๋ ์ ์ฉ์ด ์ด๋ ต๊ณ , ํ์คํ๋ ๋ฌธ์์ ๋ํด์๋ง ์ ์ฉ ๊ฐ๋ฅํฉ๋๋ค.
Input (LLM + tool for SQL to DB) | Output (with SQL tool) ✅ Right answer |
---|---|
Calculate the price ratio for stock 'ABC' between 2023- 01-03 and 2023-01-04? | > Entering new AgentExecutor chain... I will need historical stock price data for the two dates Action: Stock DB Action Input: Price of ABC stock on 2023-01-03 and 2023-01-04 > Entering new SQLDatabaseChain chain...Price of ABC stock on 2023-01-03 and 2023-01-04 SQLQuery:SELECT price FROM stocks WHERE stock_ticker = "ABC" AND date BETWEEN "2023- 01-03" AND "2023-01-04" SQLResult: [(232.0,), (225.0,)] Answer: The price of ABC stock on January 03, 2023 was 232.0 and on January 04, 2023 was 225.0. > Finished chain. Observation: The price of ABC stock on January 03, 2023 was 232.0 and on January 04, 2023 was 225.0. Thought: Now I can compute the price ratio Final Answer: The price ratio for stock 'ABC' between 2023- 01-03 and 2023-01-04 is 232.0/225.0 = 1.0311 |
--- | --- |
fitz๋ก ์ถ์ถํ ํ์ด์ง์์ ์๋์ ๊ฐ์ด find_tables()๋ก ํ ์ด๋ธ ๊ฐ์ฒด๋ฅผ ์ฐพ์์ to_markdown()๋ก markdown ํํ๋ก ์ถ์ถํ ์ ์์ต๋๋ค.
# Collect the Markdown rendering of the first table found on each page.
table_md = []
for page in pages:
    tab = page.find_tables()
    if not tab.tables:
        continue  # no table detected on this page
    table_md.append(tab[0].to_markdown())
์ถ์ถ๋ markdown ํํ์ table์ RAG์ ๋ฌธ์๋ก ๋ฑ๋กํ ์ ์์ต๋๋ค.
ํ์ ๊ทธ๋ฆผ์ด ํฌํจ๋์ด ์๊ฑฐ๋ ํ์ ์์ฝ์ RAG์ ๋ฑ๋กํจ์ผ๋ก์จ RAG ๊ฒ์์ ์ ํ๋๋ฅผ ๋์ผ์ ์์ต๋๋ค. ๋ํ ์ถ์ถ๋ ์ด๋ฏธ์ง๋ ํ์ ๋ํ ๋งํฌ๋ฅผ ์์ฑํ ๋ ํ์ฉ๋ฉ๋๋ค.
tables = []
for i, page in enumerate(pages):
    page_tables = page.find_tables()
    if not page_tables.tables:
        continue  # nothing to report for pages without a table

    # Only the first table of the page is inspected and captured.
    tab = page_tables[0]
    print(tab.to_markdown())
    print(f"index: {i}")
    print(f"bounding box: {tab.bbox}")  # bounding box of the full table
    print(f"top-left cell: {tab.cells[0]}")  # top-left cell
    print(f"bottom-right cell: {tab.cells[-1]}")  # bottom-right cell
    print(f"row count: {tab.row_count}, column count: {tab.col_count}")  # row and column counts
    print("\n\n")

    # Save the table area of the page as an image.
    extract_table_image(page, i, tab.bbox)
이때, 이미지 추출을 위한 함수는 아래와 같습니다.
from PIL import Image
def extract_table_image(page, index, bbox):
    """Crop one table out of a rendered page and upload it to S3 as a PNG.

    The page is rendered twice: once at the default resolution and once at
    200 dpi. The ratio between the two renders is used to scale ``bbox`` onto
    the 200-dpi image before cropping.

    Relies on the module-level ``key``, ``s3_prefix``, ``s3_bucket``,
    ``s3_client`` and ``files``; the uploaded key is appended to ``files``.

    Args:
        page: Page object providing ``get_pixmap``.
        index: Page index; used in the file name and the "page" metadata.
        bbox: Table bounding box as a 4-item sequence (left, top, right,
            bottom).
    """
    # Default-resolution render, used only as the reference size.
    pixmap_ori = page.get_pixmap()
    print(f"width: {pixmap_ori.width}, height: {pixmap_ori.height}")

    # High-resolution render that the table is actually cropped from.
    pixmap = page.get_pixmap(dpi=200)
    img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
    print(f"width: {pixmap.width}, height: {pixmap.height}")

    rate_width = pixmap.width / pixmap_ori.width
    rate_height = pixmap.height / pixmap_ori.height
    print(f"rate_width={rate_width}, rate_height={rate_height}")

    # Scale the bounding box onto the high-resolution image.
    scaled_box = (
        bbox[0]*rate_width,
        bbox[1]*rate_height,
        bbox[2]*rate_width,
        bbox[3]*rate_height,
    )
    crop_img = img.crop(scaled_box)

    pixels = BytesIO()
    crop_img.save(pixels, format='PNG')
    pixels.seek(0, 0)

    # Derive the destination path from the key of the source PDF.
    objectName = key[key.find(s3_prefix)+len(s3_prefix)+1:]
    folder = s3_prefix+'/captures/'+objectName+'/'
    fname = 'table_'+key.split('/')[-1].split('.')[0]+f"_{index}"
    table_key = folder+fname+'.png'

    response = s3_client.put_object(
        Bucket=s3_bucket,
        Key=table_key,
        ContentType='image/png',
        Metadata={"ext": 'png', "page": str(index)},
        Body=pixels,
    )
    files.append(table_key)
How to use Markdown output์ ๊ฐ์ด pdf๋ฅผ markdown output์ผ๋ก ์ ์ฅํ ์ ์์ต๋๋ค.
import pymupdf4llm
from langchain.text_splitter import MarkdownTextSplitter
# Convert all pages of the PDF into a single Markdown string.
md_text = pymupdf4llm.to_markdown("input.pdf")

# Split the Markdown into small, non-overlapping chunks.
splitter = MarkdownTextSplitter(chunk_overlap=0, chunk_size=40)
splitter.create_documents([md_text])
Table Recognition and Extraction With PyMuPDF
How can I extract semi structured tables from PDF using pdfplumber
How do I extract a table from a pdf file using pymupdf
How to save a pandas DataFrame table as a png
PyMuPDF4LLM์ ํ์ฉํ PDF ํ์ฑ ๋ฐ FAISS ๋ฒกํฐ ์คํ ์ด๋ฅผ ์ฌ์ฉํ RAG
Generate Synthetic QnAs from Real-world Data