-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path06_postgpt.py
99 lines (81 loc) · 4.23 KB
/
06_postgpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
import logging
from datetime import datetime
from time import time
import pandas as pd
from postgpt__utils import *
import numpy as np
import pathlib
# Anchor every working folder on the directory that contains this script, so
# the resolved paths do not depend on the current working directory.
cur_directory = os.path.dirname(os.path.abspath(__file__))

# Sub-folders relative to the script directory. The trailing slash matters:
# file names are appended to these paths by plain string concatenation.
relative_log_path = 'logs/'
relative_output_path = 'outputs/intermediate/'
relative_input_path = 'inputs/'
relative_finaloutput_directory = 'outputs/final/'

# Absolute counterparts of the relative folders above.
log_path, output_path, input_path, final_output_path = (
    os.path.join(cur_directory, relative_dir)
    for relative_dir in (
        relative_log_path,
        relative_output_path,
        relative_input_path,
        relative_finaloutput_directory,
    )
)
# Message layout shared by the console handler and the log file.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

# Module-level logger; basicConfig configures the root logger (console output).
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

# Run timestamp (12-hour clock plus AM/PM marker). It is embedded in the log
# file name below and in the final results workbook name.
TIMESTAMP = datetime.now().strftime("%Y-%m-%d_%I-%M-%S-%p")

# FileHandler opens the file immediately and does not create missing folders,
# so make sure the log directory exists before attaching the handler.
os.makedirs(log_path, exist_ok=True)

# Assign a file handler to the logger instance.
fh = logging.FileHandler(log_path + "log_postgpt_dataprocess_{}.txt".format(TIMESTAMP))
fh.setLevel(logging.INFO)
# The format passed to basicConfig only applies to the handler basicConfig
# creates; without an explicit formatter the file would contain bare messages.
fh.setFormatter(logging.Formatter(LOG_FORMAT))

# Add the handler to the logging instance.
logger.addHandler(fh)
def _to_title_case(value):
    """Return ``value.title()``; used to normalise topic labels."""
    return value.title()


# Read in the long file with missing segments.
# NOTE(review): this bare file name is resolved against the current working
# directory, while the bill-number workbook below is read from input_path —
# confirm that is intended.
filename = "full_df_and_missing_segments5.xlsx"
print("reading in ", filename)

# Load the initial files.
df = pd.read_excel(filename)
df_bill_numbers = pd.read_excel(
    input_path + "2023_bill_number_tagging_vF.xlsx",
    dtype='str',
    header=0,
)

# Every row loaded from the main workbook is labelled with this origin.
df['topic_origin'] = "ChatGPT Topic Match"

# Put topic labels in Title Case in both frames. The main frame's topics are
# cast to str first, so missing values become text before .title() is called.
df['topic'] = df['topic'].astype('str').apply(_to_title_case)
df_bill_numbers['topic'] = df_bill_numbers['topic'].apply(_to_title_case)

# Only the topic / bill-number pairing is kept from the tagging workbook.
df_bill_numbers = df_bill_numbers[['topic', 'bill_number']]

# Pattern text handed to manage_add_bill_numbers further down.
# NOTE(review): the '.' is unescaped, so if this is used as a regular
# expression it matches any character, not only a literal period — confirm.
bill_number = "130[0-9].[0-9]+"
# Create a topic crosswalk output for OHS.
results, results_two = output_topic_crosswalk(df, df_bill_numbers)

# The Excel writer does not create missing folders, so ensure the final
# output directory exists before writing into it.
os.makedirs(final_output_path, exist_ok=True)

# final_output_path already ends with '/', so join instead of prepending
# another '/' to the file name (which produced a doubled separator).
crosswalk_file = os.path.join(final_output_path, 'topic_eval_for_OHS.xlsx')

with pd.ExcelWriter(crosswalk_file) as writer:
    # Write each DataFrame to a different Excel sheet of the same workbook.
    results.to_excel(writer, sheet_name='topic_count', index=False)
    results_two.to_excel(writer, sheet_name='topic_correlation', index=False)
# Add exact matches: topic label (must be in Title Case) mapped to the list of
# terms passed to add_exact_terms for that topic.
term_dict = {
    "Disabilities, IDEA, 504 Plan and Individualized Education Plan": ["504"],
    "Data Privacy And FERPA": ["FERPA"],
}

# Cleaning steps, applied in order. Each helper receives the current frame as
# its first argument (followed by the extra arguments listed here) and its
# return value becomes the frame for the next step.
_cleanup_steps = (
    (add_exact_terms, (term_dict,)),                              # exact terms
    (manage_add_bill_numbers, (df_bill_numbers, bill_number)),    # clean bill numbers
    (clean_output, (df_bill_numbers,)),                           # clean output
    (roll_up, ()),                                                # roll up topics
)
for _step, _extra_args in _cleanup_steps:
    df = _step(df, *_extra_args)

# Order rows by chunk, then segment, then topic; missing values sort last.
df.sort_values(
    by=['text_chunk_id', 'segment_number', 'topic'],
    axis=0,
    na_position='last',
    inplace=True,
)
def _word_count(segment):
    """Return the number of space-separated pieces in ``segment``.

    Non-string values (for example a missing/NaN segment) count as 0 words
    instead of raising AttributeError on ``.split``.
    """
    if not isinstance(segment, str):
        return 0
    return len(segment.split(" "))


# Extra cleaning: flag each row by the kind of topic it carries.
# "Original" topics are the ones listed in the bill-number tagging frame.
original_topics = df_bill_numbers.topic.to_list()
_is_original = df['topic'].isin(original_topics)
df['is_other_topic'] = ~_is_original
df['original_topic'] = _is_original

# A row is uncategorized when its topic is the literal "Uncategorized" label
# or is missing altogether.
df['uncategorized'] = df['topic'].eq("Uncategorized") | df['topic'].isna()

# Uncategorized rows must not be reported as "other" topics.
df.loc[df['uncategorized'], 'is_other_topic'] = False

# Segment length in space-separated words, plus 1/0 flags for short segments.
df['segment_length'] = df['segment'].apply(_word_count)
df['segment_lessthan_30words'] = np.where(df['segment_length'] < 30, 1, 0)
df['segment_lessthan_15words'] = np.where(df['segment_length'] < 15, 1, 0)
# Add metadata from the scrape pickle.
# Columns handed to add_meta_data together with the pickle contents.
helpful_columns = [
    'Document',
    'Received',
    'Status',
    'Name',
    'Organization',
    'Government Agency Type',
    'Government Agency',
    'Email',
    'Address',
    'Phone',
    'has_attachments',
    'attachment_count',
    'head_start_commenter',
    'congress_commenter',
]

# NOTE(review): `path` is not assigned anywhere in this file. It resolves only
# if the star import from postgpt__utils provides it; otherwise this line
# raises NameError — confirm where the pickle is meant to live.
# NOTE(security): unpickling can execute arbitrary code; only load this file
# from a trusted source.
metadata_pickle = path + "/2023_scrape (1).pkl"
df_pkl = pd.read_pickle(metadata_pickle)

df = add_meta_data(df, df_pkl, helpful_columns)
# Save: columns written to the final workbook, in output order.
cols_to_export = [
    'Document',
    'text_chunk_id',
    'text',
    'segment_number',
    'rolled_up',
    'segment',
    'topic',
    'intent',
    'all_topics_in_segment',
    'all_topics_in_chunk',
    'topic_origin',
    'is_other_topic',
    'original_topic',
    'uncategorized',
    'bill_numbers',
    'segment_lessthan_30words',
    'segment_lessthan_15words',
    'has_attachments',
    'attachment_count',
    'Comment_counts',
    'attachment_text_counts',
    'to_translate',
    'head_start_commenter',
    'congress_commenter',
    'Name',
    'Organization',
    'Government Agency Type',
    'Government Agency',
    'Email',
    'Address',
    'Phone',
    'Received',
    'Status',
]

# The workbook name carries the run timestamp. No index argument is passed,
# so to_excel uses its default index handling.
results_file = final_output_path + "topic_results_{}.xlsx".format(TIMESTAMP)
df[cols_to_export].to_excel(results_file)