-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
130 lines (105 loc) · 5.27 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# src/cli.py
import argparse
import os
from typing import Optional
from pydantic import BaseModel, Field
from src.services.study_set_creator import StudySetCreator
from src.utils.config import get_api_key
from src.utils.logging import get_logger
# Module-level logger; main() uses it to report errors.
logger = get_logger()
class CLIArguments(BaseModel):
    """Validated container for the command-line options of the study set CLI.

    The field names and defaults match the argparse options declared in
    ``parse_arguments``, which builds this model from the parsed namespace.
    """

    # Single-file mode: one PDF in, one CSV out.
    input: Optional[str] = Field(None, description="Path to the PDF file")
    output: Optional[str] = Field(None, description="Output CSV file name")

    # Directory mode: every PDF in ``in_dir`` gets a CSV in ``out_dir``.
    in_dir: Optional[str] = Field(
        None,
        description="Path to the input directory containing PDF files",
    )
    out_dir: Optional[str] = Field(
        None,
        description="Path to the output directory where CSV files will be saved",
    )

    # Processing options shared by both modes.
    model: str = Field("gpt-4o-mini", description="OpenAI model to use")
    chunk_size: int = Field(10, description="Number of pages to process at once")
    use_batch: bool = Field(False, description="Use OpenAI Batch API for processing")
    text_only: bool = Field(False, description="Extract text only, ignore images")
    language: str = Field("english", description="Language for the study set")

    # The flag is a negative switch, so the description states what setting it
    # does (the previous wording, "Whether to resume ...", read the other way).
    no_resume: bool = Field(
        False,
        description=(
            "Do not resume processing from the last checkpoint. "
            "WARNING: If set and a progress file exists, it will be overwritten."
        ),
    )
def parse_arguments(argv: Optional[list] = None) -> CLIArguments:
    """Parse command-line arguments and return a CLIArguments object.

    Exactly one processing mode must be selected: single-file mode
    (``--input`` together with ``--output``) or directory mode (``--in_dir``
    together with ``--out_dir``). Any other combination is reported through
    ``parser.error``, which prints the usage message and exits the process.

    Args:
        argv: Argument strings to parse. ``None`` (the default) lets argparse
            read ``sys.argv[1:]``, so existing zero-argument callers behave
            exactly as before.

    Returns:
        A CLIArguments instance built from the parsed options.
    """
    parser = argparse.ArgumentParser(
        description="Create a study set from a PDF file or directory of PDFs.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Define arguments
    parser.add_argument("--input", type=str, help="Path to the PDF file.")
    parser.add_argument("--output", type=str, help="Output CSV file name.")
    parser.add_argument(
        "--in_dir",
        type=str,
        help="Path to the input directory containing PDF files.",
    )
    parser.add_argument(
        "--out_dir",
        type=str,
        help="Path to the output directory where CSV files will be saved.",
    )
    parser.add_argument("--model", type=str, default="gpt-4o-mini", help="OpenAI model to use.")
    parser.add_argument("--chunk_size", type=int, default=10, help="Number of pages to process at once.")
    parser.add_argument("--use_batch", action="store_true", help="Use OpenAI Batch API for processing.")
    parser.add_argument("--text_only", action="store_true", help="Extract text only, ignore images.")
    parser.add_argument("--language", type=str, default="english", help="Language for the study set.")
    # ``store_true`` already defaults to False. The help text describes what
    # passing the flag does; the old wording ("Whether to resume ...") was
    # inverted relative to the flag name.
    parser.add_argument(
        "--no_resume",
        action="store_true",
        help=(
            "Do not resume processing from the last checkpoint. "
            "WARNING: If set and a progress file exists, it will be overwritten."
        ),
    )

    args = parser.parse_args(argv)

    # Argument validation.
    # The pairing checks compare against None so that an explicitly passed
    # empty string still counts as "provided"; the mode checks use truthiness,
    # so a pair of empty strings does not select a mode.
    input_given = args.input is not None
    output_given = args.output is not None
    in_dir_given = args.in_dir is not None
    out_dir_given = args.out_dir is not None

    if input_given != output_given:
        parser.error("Both --input and --output must be provided together.")
    if in_dir_given != out_dir_given:
        parser.error("Both --in_dir and --out_dir must be provided together.")

    single_file_mode = bool(args.input and args.output)
    directory_mode = bool(args.in_dir and args.out_dir)

    if single_file_mode and directory_mode:
        parser.error("Cannot use both --input/--output and --in_dir/--out_dir at the same time.")
    if not (single_file_mode or directory_mode):
        parser.error("You must provide either both --input and --output, or both --in_dir and --out_dir.")

    return CLIArguments(**vars(args))
def main() -> None:
    """
    Create a study set from a single PDF file or from a directory of PDFs.

    Parses the command-line arguments, fetches the API key, and then either
    hands one PDF to ``StudySetCreator.to_study_set`` (``--input``/``--output``)
    or hands every ``.pdf`` file found directly inside ``--in_dir`` to
    ``StudySetCreator.process_multiple_pdfs``, with one ``.csv`` output path
    per PDF under ``--out_dir``.

    Problems (missing API key, missing input directory, no PDFs found) are
    reported through the module logger and end the run early.
    """
    args = parse_arguments()
    api_key = get_api_key()

    if not api_key:
        logger.error("API key not found. Please set it in the .env file.")
        return

    if args.input:
        # Single file processing
        creator = StudySetCreator(
            api_key=api_key,
            model=args.model,
            output_csv=args.output,
            chunk_size=args.chunk_size,
            use_batch=args.use_batch,
            language=args.language,
            no_resume=args.no_resume,
        )
        creator.to_study_set(args.input, args.text_only)
    elif args.in_dir:
        # Multiple files processing
        # Report a bad input directory the same way as the other failures
        # instead of letting os.listdir raise an uncaught exception.
        if not os.path.isdir(args.in_dir):
            logger.error(f"Input directory not found: {args.in_dir}")
            return

        # Sort for a deterministic processing order (os.listdir order is
        # arbitrary) and skip directories whose name happens to end in .pdf.
        pdf_files = sorted(
            name
            for name in os.listdir(args.in_dir)
            if name.lower().endswith('.pdf')
            and os.path.isfile(os.path.join(args.in_dir, name))
        )
        if not pdf_files:
            logger.error(f"No PDF files found in directory {args.in_dir}")
            return

        # Create the output directory only once there is something to write.
        os.makedirs(args.out_dir, exist_ok=True)

        pdf_paths = [os.path.join(args.in_dir, name) for name in pdf_files]
        output_paths = [
            os.path.join(args.out_dir, os.path.splitext(name)[0] + '.csv')
            for name in pdf_files
        ]

        creator = StudySetCreator(
            api_key=api_key,
            model=args.model,
            chunk_size=args.chunk_size,
            use_batch=args.use_batch,
            language=args.language,
            no_resume=args.no_resume,
        )
        creator.process_multiple_pdfs(
            pdf_paths=pdf_paths,
            output_paths=output_paths,
            text_only=args.text_only,
        )
    else:
        # Should not reach here: parse_arguments rejects calls without a mode.
        logger.error("Invalid arguments provided.")
        return
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()