-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathword_count_spacy.py
72 lines (50 loc) · 1.88 KB
/
word_count_spacy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import spacy
import os
# Load the spaCy pipeline "en_core_web_sm" once at import time; word_count()
# and main() below share this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")
def word_count(input_text, pos='PROPN'):
    """
    Count how often each word of one part-of-speech class occurs in a text.

    The text is run through the module-level spaCy pipeline ``nlp``. Every
    token is lower-cased, tokens whose coarse-grained tag (``token.pos_``)
    equals ``pos`` are kept, and their occurrences are counted.

    Parameters
    ----------
    input_text : str
        Raw text to analyse.
    pos : str, optional
        Coarse-grained part-of-speech tag to keep. Defaults to ``'PROPN'``,
        the value that used to be hard-coded, so existing callers are
        unaffected.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (lower-cased token) and ``counts`` (number of
        occurrences), ordered by ``value_counts`` (most frequent first).
        Empty when no token carries the requested tag.
    """
    doc = nlp(input_text)

    columns = ['text', 'base_form', 'coarse_grained', 'fine_grained', 'dependency']
    rows = [
        [token.text.lower(), token.lemma_, token.pos_, token.tag_, token.dep_]
        for token in doc
    ]
    df = pd.DataFrame(rows, columns=columns)

    # Diagnostic output kept from the original: shows which coarse-grained
    # tags occur in the text, i.e. which values of `pos` can match anything.
    print(df['coarse_grained'].unique())

    selected = df.loc[df['coarse_grained'] == pos, 'text']
    return (
        selected.value_counts(dropna=True, sort=True)
        .rename_axis('text')
        .reset_index(name='counts')
    )
def word_cloud(df):
    """
    Render a frequency table as a word cloud image.

    Reads the ``text`` and ``counts`` columns of ``df``, draws the cloud with
    pyplot, writes it to ``propn.png`` (relative to the working directory)
    and returns the ``plt`` module so the caller can save the figure again.
    """
    frequencies = {word: count for word, count in zip(df['text'], df['counts'])}

    cloud = WordCloud()
    cloud.generate_from_frequencies(frequencies=frequencies)

    # Show the cloud as an image with no axes and no margins around it.
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.margins(x=0, y=0)

    plt.savefig('propn.png', bbox_inches='tight')
    return plt
def main():
    """
    Build the word-frequency table and word cloud for the 'Income' questions.

    Steps:
      1. Read the input file (parsed as tab-separated despite its ``.csv``
         extension) and keep the ``QuestionLiteral`` of every row whose
         ``QuestionGroupLabel`` is ``'Income'``.
      2. Join those literals with ``' | '`` into one string and print its
         length.
      3. Count words with ``word_count`` and write the first 100 rows to
         ``work_count/propn.csv``.
      4. Draw the word cloud with ``word_cloud`` and save it as
         ``work_count/propn.pdf``.
    """
    input_file = '../github/colectica_api/RCNIC_covid_lable.csv'
    df_input = pd.read_csv(input_file, sep='\t')

    df_qg = df_input.loc[df_input['QuestionGroupLabel'] == 'Income', ['QuestionLiteral']]
    question_text = df_qg['QuestionLiteral'].str.cat(sep=' | ')
    print(len(question_text))

    # Size the pipeline's length limit to this text (plus some slack) so the
    # whole concatenated string can be processed in a single call.
    nlp.max_length = len(question_text) + 100

    output_dir = 'work_count'
    # exist_ok=True replaces the check-then-create pair, which could fail if
    # the directory appeared between the existence check and makedirs().
    os.makedirs(output_dir, exist_ok=True)

    df_word_count = word_count(question_text)
    df_word_count.head(100).to_csv(os.path.join(output_dir, 'propn.csv'), index=False)

    # word_cloud() returns the pyplot module; use a distinct local name so the
    # module-level `plt` import is not shadowed inside this function.
    pyplot = word_cloud(df_word_count)
    pyplot.savefig(os.path.join(output_dir, 'propn.pdf'))
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()