# -*- coding: utf-8 -*-
"""
Created on 2 November 2022
@author: Amin
"""
# Import libraries and functions
import argparse
from time import strftime, gmtime

import pandas as pd
import spacy

from data import nlp_clean, bigram
"""
=============================================================================
Clean Risk Factors and implement bigram transformation.
To run:
>>> python clean_docs.py --RF_df Data/W2V_train.csv
=============================================================================
"""
parser = argparse.ArgumentParser(description='RFs preparation')
parser.add_argument('--RF_df', type=str, default='Data/RFs_all.csv', help='path to the CSV containing risk factors')
parser.add_argument('--clean_docs', type=str, default="Data/W2V_train_3.csv", help='path to save the cleaned documents')
parser.add_argument('--Qup', type=float, default=1, help='upper word-count quantile for filtering overly long docs (risk factors)')
parser.add_argument('--Qlow', type=float, default=0.05, help='lower word-count quantile for filtering overly short docs (risk factors)')
parser.add_argument('--min_count', type=float, default=0.0005, help='minimum bigram count, as a ratio of the number of RFs')
parser.add_argument('--spacy', type=str, default="en_core_web_sm", help='spaCy model to load')
parser.add_argument('--n_jobs', type=int, default=-1, help='number of processes for nlp.pipe (-1 uses all cores)')
args = parser.parse_args()
print(args, "\n")
nlp = spacy.load(args.spacy)
print(f"{strftime('%D %H:%M', gmtime())} | <<< START >>> \n")
# Load raw text data
print(f"{strftime('%D %H:%M', gmtime())} | Loading data ...\n")
RF_df = pd.read_csv(args.RF_df).dropna()
# Strip filing boilerplate from the RFs: "Table of Contents", "Item 1A"
# headers, and "Risk Factor(s)" section titles (case-insensitive)
pattern = r"(table\s+of\s+content[s]?)|((item\W*)?1a)|(risk\s+factor[s]?)"
RF_df["Item 1A"] = RF_df["Item 1A"].str.replace(pattern, " ", case=False, regex=True)
RFs = RF_df['Item 1A'].tolist()
print(f"{strftime('%D %H:%M', gmtime())} | Cleaning risk factor docs ...\n")
corpus = nlp.pipe(RFs, n_process=args.n_jobs)
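# nlp_clean (from the local data module) is assumed to yield cleaned,
# lemmatized tokens per spaCy doc, so each document becomes a list of strings.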
clean_corpus = [list(nlp_clean(doc, lemma=True)) for doc in corpus]
print(f"{strftime('%D %H:%M', gmtime())} | Bigram transformation ...\n")
# Identity tokenizer: clean_corpus is already tokenized (lists of strings),
# so bigram() receives each document's tokens unchanged.
def token(text):
    return text

transformed_sents = bigram(
    raw_data=clean_corpus,
    tokenizer=token,
    min_cnt=args.min_count,
)
cleaned_data = RF_df.drop(columns=["Item 1A"]).copy()
cleaned_data.loc[:, 'cleaned_txt'] = [" ".join(d) for d in transformed_sents]
# Optional length filtering (currently disabled): drop docs outside the
# [Qlow, Qup] word-count quantiles supplied via --Qlow/--Qup.
# word_cnt = cleaned_data['cleaned_txt'].astype('str').map(lambda x: len(x.split()))
# Qup = int(word_cnt.quantile(q=args.Qup))
# Qlow = int(word_cnt.quantile(q=args.Qlow))
# print(f"Documents with fewer than {Qlow} or more than {Qup} words are dropped.\n")
# filtered_rf_df = cleaned_data[(word_cnt > Qlow) & (word_cnt < Qup)]
print(f"{strftime('%D %H:%M', gmtime())} | Saving cleaned documents ...\n")
cleaned_data.to_csv(args.clean_docs)
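# Note: to_csv writes the DataFrame index as the first column by default;
# pass index=False here if downstream consumers do not expect it.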
print(f"{strftime('%D %H:%M', gmtime())} | >>> END <<< \n")