data.py
# -*- coding: utf-8 -*-
"""
Created on Thu May 26 2022
@author: Amin
"""
# Import libraries
import pandas as pd
from time import gmtime, strftime
from gensim.models import Phrases, phrases
def load_data(path, low_bnd=0.1, up_bnd=1.0, col="Itam 1A", index=None):
    """
    Load the Risk Factors .csv file and filter out documents by length:
    > low_bnd: lower quantile of word counts in `col`; shorter docs are dropped
    > up_bnd: upper quantile of word counts in `col`; longer docs are dropped
    """
    print("Loading data ...\n")
    RF_df = pd.read_csv(filepath_or_buffer=path, index_col=index).dropna()
    # Word count per document, used to compute the quantile cut-offs.
    word_cnt = RF_df[col].astype('str').map(lambda x: len(x.split()))
    Qup = int(word_cnt.quantile(q=up_bnd))
    Qlow = int(word_cnt.quantile(q=low_bnd))
    print(f"Documents with fewer than {Qlow} or more than {Qup} words are dropped.\n")
    filtered_rf_df = RF_df[(word_cnt > Qlow) & (word_cnt < Qup)]
    return filtered_rf_df
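
# Usage sketch (hedged): "risk_factors.csv" is a hypothetical path; the column
# default "Itam 1A" and the quantile bounds come from the signature above.
#
#   rf_df = load_data("risk_factors.csv", low_bnd=0.1, up_bnd=0.99)
#   print(rf_df.shape)
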
def nlp_clean(doc, lemma=False):
    """Remove stop words, punctuation, numbers, etc. from a spaCy Doc."""
    # Keep alphabetic tokens and proper nouns (spaCy POS id 96 == PROPN);
    # drop stop words and single-character tokens (shape 'x'/'X').
    mask = lambda t: (t.is_alpha or t.pos == 96) and not t.is_stop and t.shape_.lower() != 'x'
    if lemma:
        tokens = (tok.lemma_.lower() for tok in filter(mask, doc))
    else:
        tokens = (tok.text.lower() for tok in filter(mask, doc))
    # Materialize so the result can be iterated more than once downstream.
    return list(tokens)
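
# Usage sketch (hedged): assumes spaCy and the "en_core_web_sm" model are
# installed; nlp_clean expects an already-parsed spaCy Doc. The sample text
# and the expected output are illustrative only.
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   doc = nlp("Our revenue may decline due to 3 key regulatory risks.")
#   print(nlp_clean(doc, lemma=True))
#   # e.g. ['revenue', 'decline', 'key', 'regulatory', 'risk']
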
def bigram(raw_data, tokenizer, min_cnt=0.001):
    """
    Transform texts to bigrams:
    > raw_data: list of documents (e.g. the "Itam 1A" column values)
    > min_cnt: min count of a bigram, as a ratio of the number of documents
    > tokenizer: callable that turns one document into a list of str tokens
    """
    # Memory-friendly iterator: re-tokenizes the corpus on each pass instead
    # of holding all token lists in memory at once.
    class MyCorpus:
        """An iterator that yields sentences (lists of str)."""
        def __iter__(self):
            for doc in raw_data:
                yield tokenizer(doc)
    sentences = MyCorpus()
    print(f"{strftime('%D %H:%M', gmtime())} | Detecting bigrams in the corpus using a memory-friendly iterator ...\n")
    # Train a bigram detector.
    bigram_transformer = Phrases(sentences, min_count=min_cnt*len(raw_data),
                                 connector_words=phrases.ENGLISH_CONNECTOR_WORDS)
    bigram_frozen = bigram_transformer.freeze()
    print(f"{strftime('%D %H:%M', gmtime())} | Creating transformed sentences ...\n")
    # Apply the trained multiword-expression (MWE) detector to the corpus.
    transformed_sents = bigram_frozen[sentences]
    return transformed_sents
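
# End-to-end sketch (hedged): the spaCy-based tokenizer and the file path are
# assumptions; any callable returning a list of str tokens works. Note that
# the returned corpus is lazy and re-tokenizes each document on every pass.
#
#   import spacy
#   nlp = spacy.load("en_core_web_sm")
#   tokenize = lambda text: nlp_clean(nlp(text), lemma=True)
#   rf_df = load_data("risk_factors.csv")
#   sents = bigram(rf_df["Itam 1A"].astype(str).tolist(), tokenize, min_cnt=0.001)
#   for sent in sents:
#       print(sent[:10])
#       break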