# extract_payslip_data_using_gpt.py
# Extract payslip fields from an image via the OpenAI API and save them to Excel.
import os
import re

import openai
import pandas as pd
# Set your OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so a real secret never has to
# be written into this file; the placeholder is only a fallback and must be
# replaced (or the variable set) before the script can reach the API.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your_openai_api_key")
def upload_image(file_path):
    """Upload the payslip image to OpenAI and return the uploaded file's ID.

    Args:
        file_path: Path of the image file; it is opened in binary mode.

    Returns:
        The value of the ``"id"`` field of the API response.
    """
    with open(file_path, "rb") as image_file:
        # File uploads go through the Files resource. ``openai.Image.create``
        # is the image-generation call and does not take ``file``/``purpose``.
        # NOTE(review): confirm that purpose="answers" is still accepted by
        # the API for this kind of file.
        response = openai.File.create(file=image_file, purpose="answers")
    return response["id"]
def create_prompt():
    """Return the instruction text that lists the payslip fields to extract.

    Returns:
        A single string naming the six fields requested from the model.
    """
    # Adjacent string literals are concatenated into one prompt string.
    return (
        "Extract the following information from this payslip: "
        "Employee ID, SSN, Current Gross Pay, Current Deductions, "
        "Current Net Pay, and Current Pay Period."
    )
def extract_data_from_image(file_id, prompt):
    """Send the prompt and the uploaded file ID to the model; return its text.

    Args:
        file_id: Identifier passed to the API as ``file``.
        prompt: Instruction text passed to the API as ``prompt``.

    Returns:
        The ``"text"`` of the first choice in the API response.
    """
    # NOTE(review): "gpt-4" is a chat model and ``file`` does not appear to be
    # a documented Completions parameter -- confirm this request is accepted,
    # or move to a chat/vision request that carries the image itself.
    response = openai.Completion.create(
        model="gpt-4",
        prompt=prompt,
        max_tokens=500,
        n=1,
        stop=None,
        temperature=0.5,
        file=file_id,
    )
    return response["choices"][0]["text"]
def parse_extracted_data(text):
    """Pull the payslip fields out of the model's "Label: value" text.

    Each field is located with a regular expression anchored on its label.
    A field whose label/value pair is not found is reported as ``None``
    instead of raising, so one missing line does not abort the whole parse.

    Args:
        text: Raw text returned by the model. ``None`` is treated as empty.

    Returns:
        A dict with the keys "Employee ID", "SSN", "Current Gross Pay",
        "Current Deductions", "Current Net Pay" and "Current Pay Period"
        (in that order); each value is the captured string or ``None``.
    """
    # Optional "$", digits with optional ",ddd" thousands groups, optional
    # cents. Without the thousands groups "$1,234.56" would be cut to "$1".
    money = r"(\$?\d+(?:,\d{3})*(?:\.\d{2})?)"
    patterns = {
        "Employee ID": r"Employee ID: (\w+)",
        "SSN": r"SSN: (\d+-\d+-\d+)",
        "Current Gross Pay": r"Current Gross Pay: " + money,
        "Current Deductions": r"Current Deductions: " + money,
        "Current Net Pay": r"Current Net Pay: " + money,
        "Current Pay Period": r"Current Pay Period: (\w+ \d+, \d+)",
    }

    text = text or ""
    data = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        # re.search returns None when the field is absent.
        data[field] = match.group(1) if match else None
    return data
def save_data_to_excel(data, excel_path):
    """Write one record to an Excel file as a single-row sheet.

    Args:
        data: Mapping of column name to value; it becomes the only row.
        excel_path: Destination path handed to ``DataFrame.to_excel``.
    """
    # Wrapping the dict in a list makes it one row rather than one column.
    frame = pd.DataFrame([data])
    # index=False leaves the DataFrame's row index out of the sheet.
    frame.to_excel(excel_path, index=False)
    print(f"Data successfully saved to {excel_path}.")
def main(image_path, excel_path):
    """Run the payslip pipeline: upload, prompt, extract, parse, save.

    Args:
        image_path: Path of the payslip image to upload.
        excel_path: Path of the Excel file to write.
    """
    # Step 1: Upload the image and get file ID
    file_id = upload_image(image_path)
    # Step 2: Create prompt for GPT
    prompt = create_prompt()
    # Step 3: Extract data using GPT API
    extracted_text = extract_data_from_image(file_id, prompt)
    # Step 4: Parse extracted data
    parsed_data = parse_extracted_data(extracted_text)
    # Step 5: Save parsed data to Excel
    save_data_to_excel(parsed_data, excel_path)
# Example usage
image_path = 'payslip_image.png'  # Path to your payslip image
excel_path = 'extracted_data.xlsx'  # Path where you want to save the Excel file

# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger file reads and API calls.
if __name__ == "__main__":
    main(image_path, excel_path)