import re
from io import BytesIO
from tokenize import tokenize, NL, NEWLINE, INDENT, DEDENT, STRING


def format_solution(solution):
    '''
    Normalise the indentation of a solution (a list of source lines).

    Known limitation: when a line dedents across multiple indent levels at
    once, this approach fails; for some reason a few solutions also still
    keep a stray \n.
    '''
    # Remove empty lines first
    tmp_solution = []
    for sentence in solution:
        if sentence == '\n' or re.match(r' +\n|\t+\n', sentence):
            continue
        # Strip trailing spaces before the newline, if there are any
        if re.match(r'(^.*?)[ ]+\n', sentence):
            sentence = re.sub(r'(^.*?)[ ]+\n', r'\1\n', sentence)
        tmp_solution.append(sentence)
    solution = tmp_solution

    def find_indent_value(starting_spaces_, indent_scheme_):
        # Walk the recorded indent widths until their running sum matches
        # the number of leading spaces; the index reached is the indent depth.
        sum_of_indents = 0
        for i, j in enumerate(indent_scheme_):
            sum_of_indents += j
            if sum_of_indents == starting_spaces_:
                return i
        raise ValueError(
            f'cannot map {starting_spaces_} leading spaces onto indent scheme {indent_scheme_}')
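    # For example (illustrative values, not from the dataset): with indent
    # scheme [0, 4, 4], a line with 8 leading spaces sits at depth 2:
    # >>> find_indent_value(8, [0, 4, 4])
    # 2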

    pruned_solution = []
    indent_scheme = []
    check_indentation_flag = True
    for sentence_number, sentence in enumerate(solution):
        starting_spaces = len(re.match(r'^([ ]*)', sentence)[1])
        if check_indentation_flag:
            possible_indent = starting_spaces - sum(indent_scheme)
            # The first sentence fixes the base indentation, which could be 0
            if possible_indent > 0 or sentence_number == 0:
                indent_scheme.append(possible_indent)
            check_indentation_flag = False
        # A line ending in ':' opens a block, so expect a new indent level next
        if re.match(r'.*:\n', sentence):
            check_indentation_flag = True
        total_indents = find_indent_value(starting_spaces, indent_scheme)
        indent = total_indents * '\t'
        pruned_solution.append(re.sub(r'([ ]*)(.*?)\n', indent + r'\2\n', sentence))
    return pruned_solution
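

# A minimal sketch of format_solution on a 4-space-indented snippet
# (the lines here are illustrative, not taken from the dataset):
# >>> format_solution(['def add(a, b):\n', '    return a + b\n'])
# ['def add(a, b):\n', '\treturn a + b\n']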


def getDataAnalysis():
    #########################
    # Counting the number of unique questions
    unique_questions = {}
    question = re.compile(r'^#[ ]?[0-9]*[.]?[ ]?(.*)')
    with open("data/english_python_data_pruned.txt", "r") as f:
        for line in f:
            match_object = question.match(line)
            if match_object and match_object[1] not in unique_questions:
                unique_questions[match_object[1]] = True
    print(len(unique_questions))
    # for i in unique_questions:
    #     print(i)
    ########################
    # Counting unique question/answer pairs
    unique_questions_with_different_solution = {}  # used to check for unique answers
    question_answer_pair = {}  # used to save the solutions of each unique question
    solution = []
    with open("data/english_python_data_pruned.txt", "r") as f:
        for line in f:
            match_object = question.match(line)
            if match_object:
                if len(solution) == 0:
                    prev_match = match_object[1]
                    continue
                # Collapse all whitespace so reformatted duplicates compare equal
                solution_combined = " ".join(solution).replace('\n', "").replace('\t', "").replace(" ", "")
                if prev_match not in unique_questions_with_different_solution:
                    unique_questions_with_different_solution[prev_match] = [solution_combined]
                    question_answer_pair[prev_match] = [solution]
                elif solution_combined not in unique_questions_with_different_solution[prev_match]:
                    # Keep the solution only if this exact one has not been seen yet
                    unique_questions_with_different_solution[prev_match].append(solution_combined)
                    question_answer_pair[prev_match].append(solution)
                solution = []
                prev_match = match_object[1]
            else:
                solution.append(line)
    sum_ = sum(len(v) for v in unique_questions_with_different_solution.values())
    print(f'total unique question and solution pairs: {sum_}')
    ############################
    # Extracting question/answer pairs using a single regex (alternative approach):
    # f = open("english_python_data_pruned.txt", "r")
    # entire_file = f.read()
    # question_and_answer = r'^#[ ]?[0-9]*[.]?[ ]?(.*?)$([\S\s]*)-------------'
    # pairs = re.findall(question_and_answer, entire_file, re.MULTILINE)
    #
    ###########################
    # Formatting solutions:
    # 1. Convert 4-space or 3-space indentation to \t
    # 2. Remove trailing spaces
    # 3. Remove lines containing only \n or only spaces
    # 4. TODO: remove spaces around operators like ==, &&
    keyword_analysis = {}
    questions_list = []
    answers_list = []
    for question_text in question_answer_pair:
        for raw_solution in question_answer_pair[question_text]:
            questions_list.append(question_text)
            formatted_solution = format_solution(raw_solution)
            answers_list.append(formatted_solution)
            code = "".join(formatted_solution)
            try:
                # Tokenize the solution and count token frequencies,
                # skipping the ENCODING and ENDMARKER tokens
                token_stream = list(tokenize(BytesIO(code.encode('utf-8')).readline))
                for tok in token_stream[1:-1]:
                    if tok.string not in keyword_analysis:
                        keyword_analysis[tok.string] = 1
                    else:
                        keyword_analysis[tok.string] += 1
            except Exception:
                print("Error in tokenization")
    print('Total length of the keyword dictionary is', len(keyword_analysis))
    print(keyword_analysis)
    # Note:
    # Many of the "keywords" are actually variable names, string literals, or
    # numbers. I might have to switch to a character-level input dictionary,
    # but that would make the problem harder for the network to solve.
    # Will try BPE?


def getData(path):
    '''
    Read the dataset at `path` and return question/answer pairs.
    :return: two parallel lists: questions and their formatted answers
    '''
    question = re.compile(r'^#[ ]?[0-9]*[.]?[ ]?(.*)')
    unique_questions_with_different_solution = {}  # used to check for unique answers
    question_answer_pair = {}  # used to save the solutions of each unique question
    solution = []
    with open(path, "r") as f:
        for line in f:
            match_object = question.match(line)
            if match_object:
                if len(solution) == 0:
                    prev_match = match_object[1]
                    continue
                # Collapse all whitespace so reformatted duplicates compare equal
                solution_combined = " ".join(solution).replace('\n', "").replace('\t', "").replace(" ", "")
                if prev_match not in unique_questions_with_different_solution:
                    unique_questions_with_different_solution[prev_match] = [solution_combined]
                    question_answer_pair[prev_match] = [solution]
                elif solution_combined not in unique_questions_with_different_solution[prev_match]:
                    unique_questions_with_different_solution[prev_match].append(solution_combined)
                    question_answer_pair[prev_match].append(solution)
                solution = []
                prev_match = match_object[1]
            else:
                solution.append(line)
    questions_list = []
    answers_list = []
    for question_text in question_answer_pair:
        for raw_solution in question_answer_pair[question_text]:
            questions_list.append(question_text)
            answers_list.append("".join(format_solution(raw_solution)))
    return questions_list, answers_list
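

# A minimal usage sketch, assuming the pruned dataset file used elsewhere in
# this module exists at the same relative path:
# >>> questions, answers = getData("data/english_python_data_pruned.txt")
# >>> len(questions) == len(answers)
# True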


def getTokenizer(python_code):
    '''
    Tokenize a string of Python code.
    :return: list of string tokens, with indentation encoded as runs of \t
             and line breaks as \n
    '''
    tokens = []
    try:
        token_stream = list(tokenize(BytesIO(python_code.encode('utf-8')).readline))
        indents = 0
        last_token = token_stream[0]
        # Skip the first token (ENCODING) and the last one (ENDMARKER)
        for tok in token_stream[1:-1]:
            if tok.exact_type == NL:  # non-logical newline, e.g. a blank line
                tokens.append("\n")
                continue
            if tok.exact_type == DEDENT:
                indents -= 1
            if tok.exact_type == INDENT:
                indents += 1
            if last_token.exact_type == NEWLINE:
                # A new logical line starts: emit its current indentation
                tokens.append(indents * '\t')
            if tok.exact_type == STRING:
                # Split string literals into single characters, keeping an
                # f-string prefix attached to its opening quote
                if re.match(r'^f"', tok.string):
                    string_tokens = ['f"'] + list(tok.string[2:])
                elif re.match(r"^f'", tok.string):
                    string_tokens = ["f'"] + list(tok.string[2:])
                else:
                    string_tokens = list(tok.string)
                tokens = tokens + string_tokens
            elif tok.exact_type == DEDENT or tok.exact_type == INDENT:
                pass  # indentation is already encoded via the \t runs
            else:
                tokens.append(tok.string)
            last_token = tok
    except Exception:
        print("Error in tokenization")
    return tokens
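

# A minimal sketch of getTokenizer on a tiny snippet (illustrative only; note
# the trailing '' emitted when the final dedent returns to indent level 0):
# >>> getTokenizer("def add(a, b):\n\treturn a + b\n")
# ['def', 'add', '(', 'a', ',', 'b', ')', ':', '\n', '\t',
#  'return', 'a', '+', 'b', '\n', '']
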
# getDataAnalysis()
# getData("data/english_python_data_pruned.txt")