-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathactivity_parser.py
24 lines (17 loc) · 958 Bytes
/
activity_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# activity_parser.py
from utils.pdf_utils import pdf_to_markdown
from utils.llm_utils import content_to_dict, configure_genai
from utils.file_utils import read_text_file, write_json_file
from constants import GEMINI_API_KEY, GEMINI_MODEL_NAME
def extract_activity_data(pdf_file, assay_page_start, assay_page_end, assay_name, compound_id_list, output_dir, lang='en'):
    """Extract assay data from a page range of a PDF and save it as JSON.

    The selected pages are converted to Markdown with ``pdf_to_markdown``,
    the Markdown text is read back, the generative-AI client is configured
    with ``GEMINI_API_KEY``, and ``content_to_dict`` turns the text into the
    assay result. The result is written to ``assay_data.json`` inside
    ``output_dir`` and also returned.

    Args:
        pdf_file: PDF to process; passed to ``pdf_to_markdown`` unchanged.
        assay_page_start: Forwarded to ``pdf_to_markdown`` as ``page_start``.
        assay_page_end: Forwarded to ``pdf_to_markdown`` as ``page_end``.
            Whether the bounds are inclusive is decided by that helper.
        assay_name: Passed to ``content_to_dict`` together with the content.
        compound_id_list: Forwarded to ``content_to_dict`` as
            ``compound_id_list``.
        output_dir: Directory handed to ``pdf_to_markdown`` and used as the
            location of ``assay_data.json``.
        lang: Forwarded to ``pdf_to_markdown``; defaults to ``'en'``.

    Returns:
        The value returned by ``content_to_dict`` (presumably a dict, given
        the helper's name -- confirm against ``utils.llm_utils``).
    """
    # Function-scope import so this block does not depend on the module's
    # import section being changed.
    import os

    # Parse the requested pages of the PDF file to Markdown.
    assay_md_file = pdf_to_markdown(
        pdf_file,
        output_dir,
        page_start=assay_page_start,
        page_end=assay_page_end,
        lang=lang,
    )

    # Read the content of the Markdown file.
    content = read_text_file(assay_md_file)

    # Configure the AI client before asking it to structure the content.
    configure_genai(GEMINI_API_KEY)
    assay_dict = content_to_dict(
        content,
        assay_name,
        compound_id_list=compound_id_list,
        model_name=GEMINI_MODEL_NAME,
    )

    # Save assay_dict to a JSON file. os.path.join avoids a doubled separator
    # when output_dir ends with one, and avoids targeting the filesystem root
    # ("/assay_data.json") when output_dir is an empty string.
    output_json = os.path.join(output_dir, 'assay_data.json')
    write_json_file(output_json, assay_dict)

    return assay_dict