-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess.py
73 lines (55 loc) · 1.57 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
from collections import Counter
from os import walk

from config import CONFIG
'''
Preprocess : string(file name) -> Counter(for char), Counter(for seq), int, int
Counter : mapping of {(string) : int} (raw counts, not yet normalised)
Example : Preprocess('text.txt')
Take the name of a text file (as a string) and count the occurrences
of each letter and of every sequence of two consecutive letters.
'''
def Preprocess(filename='text.txt'):
    '''
    Count the letters and the adjacent letter pairs found in a text file.

    The whole file is read and lower-cased, then every character for which
    str.isalpha() is false is dropped.  Pairs are built from neighbours in
    that filtered stream, so two letters separated only by spaces, digits
    or punctuation in the file still form a pair.

    filename : string, path of the text file to read (default 'text.txt')

    Returns a 4-tuple:
        Counter {letter: count},
        Counter {two-letter string: count},
        int, number of letters kept,
        int, number of pairs (one less than the letters, 0 if no letters).
    '''
    # The context manager closes the handle even when read() raises.
    with open(filename, 'r') as f:
        text = f.read()

    # Lower-case first, then filter: the order is kept from the original
    # implementation because lower() can change which characters are letters.
    chr_list = [c for c in text.lower() if c.isalpha()]

    # Zipping the list with itself shifted by one yields each neighbouring
    # pair exactly once; an empty or one-letter list yields no pairs.
    rel_list = [a + b for a, b in zip(chr_list, chr_list[1:])]

    return Counter(chr_list), Counter(rel_list), len(chr_list), len(rel_list)
# Script section: accumulate letter and letter-pair counts over every file
# below 'COCA/', turn them into relative frequencies, and write the two
# tables to 'corpus_data.py' as Python dict literals.

# Seed both tables with every symbol and every ordered pair of symbols from
# CONFIG['alphabet_string'], so entries never seen in the corpus are still
# written out with frequency 0.
alp_freq, seq_freq = dict(), dict()
for c in CONFIG['alphabet_string']:
    alp_freq[c] = 0
    for cc in CONFIG['alphabet_string']:
        seq_freq[c+cc] = 0

total_alp, total_seq = 0, 0
for dirpath, dirnames, filenames in walk('COCA/'):
    for name in filenames:
        # Join with dirpath (not the fixed 'COCA/' prefix) so files that sit
        # in sub-directories are opened at their real location.
        temp_alp, temp_seq = Preprocess(os.path.join(dirpath, name))[:2]
        # Preprocess keeps every character for which str.isalpha() is true,
        # which may include letters absent from the seeded tables
        # (presumably e.g. accented letters - depends on CONFIG).  Those are
        # skipped, and the totals only count what was actually added, so the
        # resulting frequencies of each table still sum to 1.
        for i in temp_alp:
            if i in alp_freq:
                alp_freq[i] += temp_alp[i]
                total_alp += temp_alp[i]
        for i in temp_seq:
            if i in seq_freq:
                seq_freq[i] += temp_seq[i]
                total_seq += temp_seq[i]

# Fail with an explicit message instead of a bare ZeroDivisionError, and do
# so before the output file is opened so an existing one is not truncated.
if total_alp == 0 or total_seq == 0:
    raise ValueError('no usable letters or letter pairs found under COCA/')

for i in alp_freq:
    alp_freq[i] /= total_alp
for i in seq_freq:
    seq_freq[i] /= total_seq

# The context manager closes the output file even if a write fails.
with open('corpus_data.py','w') as fr:
    # Single-letter table: one tab-indented entry per line.
    fr.write('Alpha_freq = { \n')
    for c in alp_freq:
        fr.write('\t\'' + str(c) + '\': '+ str(alp_freq[c]) + ', \n')
    fr.write('} \n')
    # Pair table: all entries on one line.
    fr.write('Sequence_freq = { \n')
    for c in seq_freq:
        fr.write('\'' + str(c) + '\': '+ str(seq_freq[c]) + ', ')
    fr.write('} \n')