parser2.py
import os
import json
import subprocess
import time
from typing import List, Dict
import re


def split_into_job_chunks(text: str) -> List[str]:
    """Split text into chunks based on the MEM- pattern, keeping each job listing intact."""
    # Split on the MEM- pattern with a lookahead so each delimiter stays attached
    # to its chunk; the [1:] slice drops any introductory text before the first ID.
    chunks = re.split(r'(?=MEM-)', text)[1:]
    # Remove empty chunks and strip whitespace
    return [chunk.strip() for chunk in chunks if chunk.strip()]
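
# A minimal sketch of what the lookahead split yields on a made-up bulletin
# (illustrative input only, not a real listing):
#   re.split(r'(?=MEM-)', "intro text MEM-001-24 job one MEM-002-24 job two")
#   -> ['intro text ', 'MEM-001-24 job one ', 'MEM-002-24 job two']
# The [1:] slice above then discards the introductory segment before the first ID.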


def process_chunk(chunk: str, example_text: str, filename: str) -> List[Dict]:
    """Process a single chunk using the LLM."""
    # Note: example_text is accepted but not currently used; the expected JSON
    # shape is spelled out directly in the prompt below.
    try:
        # Write the chunk to a temporary file, prefixed with the source filename
        with open("temp_chunk.txt", "w", encoding="utf-8") as f:
            f.write(filename + "\n" + chunk)

        command = """cat temp_chunk.txt | llm --system $'You are an expert in parsing congressional job listings from text files split into chunks. Extract job listings into a structured JSON array where each listing is an object with consistent fields. Handle the following formatting requirements: \\
1. Convert any non-ASCII characters \\(including smart quotes, em-dashes, etc.\\) to their closest ASCII equivalent \\
2. Normalize bullet points and list formatting into consistent structures \\
3. Maintain hierarchical relationships in nested lists \\
4. Preserve paragraph breaks in description fields \\
5. Format all dates in ISO 8601 format \\(YYYY-MM-DD\\)' \\
-m gemini-2.0-flash-exp -o json_object 1 \\
$'Create a JSON array containing objects for each job listing with the following required fields: \\
- id: Job ID in the format "MEM-XXX-YY", where XXX is the number from the listing \\(do not use XXX literally\\). Do not carry over IDs from one chunk to the next. \\
- position_title: Full position title \\
- office: Congressional office or committee name - do not use placeholders \\
- location: Primary work location \\
- posting_date: Date from the filename \\(format: YYYY-MM-DD\\) \\
- description: Full job description \\
- responsibilities: Array of responsibilities \\
- qualifications: Array of required qualifications \\
- how_to_apply: Application instructions \\
- salary_info: Salary information if provided \\(use null if not specified\\) \\
- contact: Contact information for applications \\
- equal_opportunity: Equal opportunity statement if present \\(use null if not specified\\) \\
Do not process introductory boilerplate language or subscribe/unsubscribe sections as job listings. Format all text fields as UTF-8 strings, convert bullet points to array elements, and maintain paragraph structure in longer text fields. Remove any formatting characters while preserving the semantic structure of the content.'"""

        # Run the command and capture both stdout and stderr
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            check=True
        )

        # Surface any stderr output for debugging
        if result.stderr:
            print(f"Command stderr: {result.stderr}")

        try:
            parsed_jobs = json.loads(result.stdout)
            return parsed_jobs
        except json.JSONDecodeError as je:
            print(f"JSON Decode Error: {je}")
            print(f"Raw output that failed to parse: {result.stdout[:500]}...")  # First 500 chars
            return []
    except subprocess.CalledProcessError as ce:
        print(f"Command failed with return code: {ce.returncode}")
        print(f"Command stdout: {ce.stdout}")
        print(f"Command stderr: {ce.stderr}")
        print(f"Failed command: {command}")
        return []
    except Exception as e:
        print(f"Unexpected error: {str(e)}")
        print(f"Error type: {type(e).__name__}")
        import traceback
        print(f"Full traceback: {traceback.format_exc()}")
        return []
    finally:
        # Clean up the temporary file
        if os.path.exists("temp_chunk.txt"):
            try:
                os.remove("temp_chunk.txt")
            except Exception as e:
                print(f"Error removing temporary file: {e}")


def main():
    directory_path = "output"
    output_dir = "json_gemini_flash"  # change this depending on the LLM

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Get the set of already-processed files (base names without .json)
    output_files = {f.split('.json')[0] for f in os.listdir(output_dir)}

    # Example template for the JSON structure (currently not injected into the
    # prompt; see the note in process_chunk)
    example = [{
        "date": "2024-11-04",
        "id": "MEM-458-24",
        "office": "Congressman Steven Horsford",
        "position": "District Representative",
        "location": "North Las Vegas, Nevada",
        "key_responsibilities": [
            "strengthening relationships with key stakeholders, as well as attending engagements on behalf of the Member",
            "helping constituents resolve problems with federal agencies through written and verbal communication"
        ],
        "requirements": [
            "Political knowledge and comfortable navigating complicated situations",
            "Strong written, verbal, analytical, and organization skills; impeccable customer service manners; public speaking skills"
        ],
        "job_type": "Full-time",
        "salary": "Commensurate with experience",
        "email": "[email protected]"
    }]
    example_text = json.dumps(example)

    for filename in os.listdir(directory_path):
        if 'Member' in filename and filename.endswith(".txt") and filename.split(".txt")[0] not in output_files:
            print(f"Processing {filename}")

            # Read the entire file
            with open(os.path.join(directory_path, filename), "r", encoding="utf-8") as f:
                text_string = f.read()

            # Split into manageable chunks
            chunks = split_into_job_chunks(text_string)
            all_jobs = []

            # Process each chunk
            for i, chunk in enumerate(chunks):
                print(f"Processing chunk {i + 1}/{len(chunks)} of {filename}")
                time.sleep(8)  # Rate limiting
                jobs = process_chunk(chunk, example_text, filename)
                all_jobs.extend(jobs)

            # Write the combined results to the output file
            output_path = os.path.join(output_dir, filename.replace('.txt', '.json'))
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(all_jobs, f, indent=2, ensure_ascii=False)

            print(f"Completed processing {filename}")
            time.sleep(5)  # Rate limiting between files


if __name__ == "__main__":
    main()
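
# Usage sketch (assumed workflow): put the "Member" bulletin .txt files in
# ./output, then run:
#   python parser2.py
# Parsed listings are written to ./json_gemini_flash/<bulletin name>.json,
# one JSON array per input file; already-processed files are skipped.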