text_attr.py
# -*- coding: utf-8 -*-
"""
Created on April 2024
@author: Amin
"""
# Import libraries and functions
from data import load_data
import pandas as pd
import numpy as np
from multiprocessing import Pool
from functools import partial
import os
import gc
from time import strftime, gmtime
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from readability import Readability
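# Assumption: the VADER lexicon may not be present locally; on a fresh
# environment this one-time download is needed before the analyzer is used.
import nltk
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')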
# Multiprocessing helpers: split a pandas object into chunks, apply a function
# to each chunk in a worker process, and concatenate the results.
# NOTE: this relies on the 'fork' start method; on platforms that default to
# 'spawn' (Windows, recent macOS) the script body needs an
# `if __name__ == '__main__':` guard.
def parallelize(data, func, num_of_processes=8):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset):
    return data_subset.apply(func)

def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)
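# Illustrative usage: count words per row across two worker processes.
#   s = pd.Series(["a b", "a b c"])
#   parallelize_on_rows(s, lambda x: len(x.split()), num_of_processes=2)
#   # -> pd.Series([2, 3])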
print(f"{strftime('%D %H:%M', gmtime())} | <<< START >>> \n")
# Import data
print(f"{strftime('%D %H:%M', gmtime())} | Loading data ...\n")
All_RFs = load_data("Data/RFs_all2.csv")
# RF_df = pd.read_csv("Data\clean_docs_3.csv")
# t2v_df = pd.read_csv("Top2Vec\T2V_df_H95.csv").set_index("index").drop(columns=['Docs'])
# # Merge files
# topics_df = pd.concat([RF_df, t2v_df[['Topic', 'Score', 'Topic_H', 'Score_H']]], axis=1)
# All_RFs = pd.merge(
# left=topics_df.drop(columns=['cleaned_txt', 'category', 'Industry']),
# right=All_RFs,
# left_on=["rf_seq", "CIK", "report_dt", "filing_dt"],
# right_on=['Unnamed: 0', 'CIK', 'report_dt', 'filing_dt'],
# how='left'
# ).drop(columns=['Unnamed: 0'])
All_RFs = All_RFs.sample(1000)  # work on a random sample of 1,000 risk factors
All_RFs['rf_length'] = All_RFs['Item 1A'].map(lambda x: len(x.split()))  # length in words
print(f"{strftime('%D %H:%M', gmtime())} | Named entities in risk factors ...\n")
def NER_func(RF):
    """
    This function determines the named entity tags in the RF and returns
    them as one space-separated string.
    """
    Doc = nlp(RF)
    NERs = [ent.label_ for ent in Doc.ents]
    NE_joined = " ".join(NERs)
    return NE_joined
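# e.g. NER_func("Apple operates stores in California.") typically returns
# 'ORG GPE' with the small English model.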
NERs = parallelize_on_rows(All_RFs["Item 1A"], NER_func, num_of_processes=os.cpu_count())
All_RFs['NERs'] = NERs
del NERs
gc.collect()
## Time orientation
print(f"{strftime('%D %H:%M', gmtime())} | Computing time orientation \n")
def det_tense(RF):
    """
    This function counts the number of verbs in past, present, and future tenses.
    """
    Doc = nlp(RF)
    tags = [tok.tag_ for tok in Doc]  # Penn Treebank POS tag for every token
    Pa = len([t for t in tags if t in ["VBD", "VBN"]])         # past
    Pr = len([t for t in tags if t in ["VBP", "VBZ", "VBG"]])  # present
    Fu = len([t for t in tags if t == "MD"])                   # future: modals ("VBC"/"VBF" are not Penn Treebank tags)
    return (Pa, Pr, Fu)
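# e.g. det_tense("We expanded last year and will expand further.") typically
# returns (1, 0, 1): 'expanded' is tagged VBD and 'will' is tagged MD.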
Time = parallelize_on_rows(All_RFs["Item 1A"], det_tense, num_of_processes=os.cpu_count()).to_list()
All_RFs[['Pa', 'Pr', 'Fu']] = Time
del Time
gc.collect()
## Sentiment
print(f"{strftime('%D %H:%M', gmtime())} | Computing sentiment \n")
sid = SentimentIntensityAnalyzer()
def sent_func(RF):
    return sid.polarity_scores(RF)['compound']
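# e.g. sent_func("Failure to comply could severely harm our business.")
# typically returns a negative compound score (VADER range: -1 to 1).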
Sentiment = parallelize_on_rows(All_RFs["Item 1A"], sent_func, num_of_processes=os.cpu_count())
All_RFs['Sentiment'] = Sentiment
del Sentiment
gc.collect()
## Readability
print(f"{strftime('%D %H:%M', gmtime())} | Computing readability \n")
def read_func(doc):
    try:
        return Readability(doc).gunning_fog().score
    except Exception:  # Readability requires at least 100 words; shorter RFs get no score
        return None
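# The Gunning Fog index estimates the years of schooling needed to read a text:
#   0.4 * (words / sentences + 100 * complex_words / words)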
# measure readability per topic
# FOG_df = All_RFs.groupby(["CIK", "report_dt", "filing_dt", "Topic_H"])['Item 1A']\
# .agg(lambda x: " ".join(x)).reset_index()
FOG = parallelize_on_rows(All_RFs["Item 1A"], read_func, num_of_processes=os.cpu_count())
All_RFs['FOG'] = FOG
# All_RFs = pd.merge(
# left=All_RFs.drop(columns="Item 1A"),
# right=FOG_df.drop(columns="Item 1A"),
# on=["CIK", "report_dt", "filing_dt", 'Topic_H'],
# how='left'
# )
All_RFs.drop(columns=['Item 1A'], inplace=True)
print(f"{strftime('%D %H:%M', gmtime())} | Saving output to disk | \n")
All_RFs.to_csv('Data/RF_text_attr2.csv', index=False)
print(f"{strftime('%D %H:%M', gmtime())} | >>> END <<< \n")