-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.py
275 lines (240 loc) · 11.2 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
from pathlib import Path
import numpy as np
import pandas as pd
from numba import njit, jit
from difflib import SequenceMatcher
import time
from numba.typed import List
from operator import itemgetter
# normally I wouldn't write such redundant code but it's only ten files so eh
with open("data/sample.1", "rb") as file1, open("data/sample.2", "rb") as file2, open("data/sample.3", "rb") as file3, open("data/sample.4", "rb") as file4, open("data/sample.5", "rb") as file5, open("data/sample.6", "rb") as file6, open("data/sample.7", "rb") as file7, open("data/sample.8", "rb") as file8, open("data/sample.9", "rb") as file9, open("data/sample.10", "rb") as file10:
file_1 = file1.read()
file_2 = file2.read()
file_3 = file3.read()
file_4 = file4.read()
file_5 = file5.read()
file_6 = file6.read()
file_7 = file7.read()
file_8 = file8.read()
file_9 = file9.read()
file_10 = file10.read()
# we can see that the length of encoded array matches exactly the file size. E.g, sample.1 is 17.408 kb
# that's exact same length of encoded_array_1 (17408)
encoded_array_1 = bytearray(file_1)
encoded_array_2 = bytearray(file_2)
encoded_array_3 = bytearray(file_3)
encoded_array_4 = bytearray(file_4)
encoded_array_5 = bytearray(file_5)
encoded_array_6 = bytearray(file_6)
encoded_array_7 = bytearray(file_7)
encoded_array_8 = bytearray(file_8)
encoded_array_9 = bytearray(file_9)
encoded_array_10 = bytearray(file_10)
encoded_arrays = [encoded_array_1, encoded_array_2, encoded_array_3, encoded_array_4,
encoded_array_5, encoded_array_6, encoded_array_7, encoded_array_8,
encoded_array_9, encoded_array_10
]
print("Size of sample.1 in bytes: ", Path('data/sample.1').stat().st_size)
for val, i in enumerate(encoded_arrays):
print("Length of sample." + str(val+1) + " byte array: ", len(i))
# helper function that pads all rows so they are same length as longest row
def pad_numpy_matrix(matrix):
l = [len(a) for a in matrix]
max_len = max(l)
# np.pad is much more space efficient than alternatives
c = np.asarray([np.pad(a, (0, max_len - len(a)), 'constant', constant_values=0) for a in matrix])
# c = np.asarray([np.hstack((a, np.zeros(max_len-a.shape[0], dtype = np.int8))) for a in matrix])
return c
# Solution using CPython difflib
# ~30s for bytes
def longest_Substring(s1,s2):
seq_match = SequenceMatcher(None,s1,s2)
match = seq_match.find_longest_match(0, len(s1), 0, len(s2))
# return the longest substring
if (match.size!=0):
return (s1[match.a: match.a + match.size])
else:
return ('Longest common sub-string not present')
def check_all(arrays):
# https://github.com/python/cpython/blob/0daf72194bd4e31de7f12020685bb39a14d6f45e/Lib/difflib.py#L305
array_copy = arrays.copy()
maxes = []
count = 0
# no need to compare the same two arrays twice, 1+2+3+...+10 = 45 comparisons only
for i in range(len(array_copy)):
a = array_copy[0]
del array_copy[0]
count += 1
next_count = 0
for j in range(len(array_copy)):
next_count += 1
add = count+next_count
longest = longest_Substring(a, array_copy[j])
maxes.append((count,(count+next_count),len(longest)))
return maxes
str_arrays = []
bytes_arrays = []
for i in encoded_arrays:
str_arrays.append(str(bytes(i)))
bytes_arrays.append(bytes(i))
answer_str = check_all(str_arrays)
str_pair = max(answer_str,key=itemgetter(2))[0:2]
str_max_length = max(answer_str,key=itemgetter(2))[2]
print("The longest common substring(str) length is: ", str_max_length)
print("The longest common substring(str) origin files are: ", str_pair, '\n')
start = time.time()
answer_bytes = check_all(bytes_arrays)
end = time.time()
bytes_pair = max(answer_bytes,key=itemgetter(2))[0:2]
bytes_max_length = max(answer_bytes,key=itemgetter(2))[2]
cpython_runtime = end-start
print("The longest common substring(bytes) length is: ", bytes_max_length)
print("The longest common substring(bytes) origin files are: ", bytes_pair, '\n')
print("CPython runtime (bytes): ", cpython_runtime)
# Testing CPython Solutions to show that a string of max length does actually exist. If so, then new solution only has to match CPython solution
s1 = np.array(encoded_arrays[1])
s2 = np.array(encoded_arrays[2])
# print(len(s1), len(s2))
x = 27648
seq_match = SequenceMatcher(None,s1,s2)
match = seq_match.find_longest_match(0, len(s1), 0, len(s2))
print("Is CPython solution correct? ", np.all(s1[match[0]:match[0]+x]==s2[match[1]:match[1]+x]))
# let's make everything a numpy array of arrays. Looping to append to
# np arrays is actually very inefficient so convert at end instead
def create_np_array(list_arrays):
allArrays = []
for i in list_arrays:
allArrays.append(np.array(i))
# test that everything went ok
#print(len(allArrays), len(allArrays[0])==len(allArrays[1]), type(allArrays[0]))
main_array = []
for i in range(256):
main_array.append([[], []])
# repeat this for each file (10x)
for val, i in enumerate(allArrays):
new_i = i
# add each possible array
for j in range(len(i)):
head = new_i[0]
# each array in 256 main arrays will have the original file in position 0 followed by actual substring content
# to_append = np.concatenate(([val], new_i))
a = len(new_i)
main_array[head][0].append([val+1, a, j])
main_array[head][1].append(new_i)
new_i = new_i[1:]
print('complete enumeration')
return(main_array)
def sort_matrix(main_array, global_max):
y = np.array(main_array[0])
lengths = y[:,1]
max_length = np.max(lengths)
# if the maximum length of all strings in this current matrix is less than current max substring, no need to bother. Skip this matrix
if max_length>global_max[2]:
# one of the most effective things by far. There's no need to assess strings whose original lengths < current max substring
# so let's remove them, and only sort the rows that are larger. Easy matrix transformations
use_rows = lengths > global_max[2]
temp_final = np.array(main_array[1])[use_rows]
keys_final = y[use_rows]
#b = time.time()
#print('make np: ', b-a)
# make all arrays same size. Pad with 0's so sortable
temp_padded = pad_numpy_matrix(temp_final)
#c = time.time()
#print('pad rows: ', c-a)
# np.lexsort each matrix, returns the final order of rows for sort
# by far the biggest eater of time here. ~80-95%. If we can cut this down, drastic improvement
sort_order = np.lexsort(np.fliplr(temp_padded).T)
#sort_order = -temp_padded[:,:].argsort(axis=0)[0]
#d = time.time()
#print('sort: ', d-a)
# we also want keys to follow new order. This way we can sort rows independent of position, size (keys)
sorted_keys = keys_final[sort_order]
temp_sorted = temp_padded[sort_order]
#e = time.time()
#print('append: ', e-a)
return [sorted_keys, temp_sorted]
else:
return None
def max_suffix(array, global_max, debug=False):
# [[meta], [matrix]]
# global_max = (x, y, length) . Both original file locations and length of current max substring
meta = array[0]
matrix = array[1]
new_max = global_max
# i is each 256 array in array: [[],[]]
# if there isn't more than one substring beginning with that number no need to check
if len(meta) >1:
for j in range(1, len(matrix)):
a = j
b = j+1
# avoid checking string if it's original length less than current stored max value
if meta[-a][1] > new_max[2]:
if debug:
print("a and b", a, b)
print("meta", meta[-a][0], meta[-b][0])
# we want the first row working backwards that's not from same origin file
if meta[-a][0] != meta[-b][0]:
# np.ndarray
# b < a in length
a_array = np.array(matrix[-a])
b_array = np.array(matrix[-b])
if debug:
print("a", a_array)
print("b", b_array)
c = a_array==b_array
#non_zero_array = np.nonzero(c == False)[0]
if debug:
print(c) #non_zero_array)
# this part threw me off so much. We have to treat perfect matches and matches separately. Perfect matches do not return anything for c==False
# So nothing will be returned, and skips perfect matches. Very bad. np.all() assesses perfect matches, and if there is one, use the shorter's length
if np.sum(c) > 0:
if np.all(c):
LCS_length = meta[-b][1]
else:
LCS_length = np.nonzero(c == False)[0][0]
if debug:
print(LCS_length)
if LCS_length > new_max[2]:
# we need the original string size precisely here. If two substrings are perfect match, we don't want all appended 0's there as well
# only append actual length of substring added, not padded length
new_max = (meta[-a][0], meta[-b][0], LCS_length, (meta[-a][2], meta[-b][2]))
#print("updated", new_max)
else:
pass
return new_max
# Main function that just loops through all separate original strings
# ~22s runtime. Around 25% improvement over CPython
def loop_matrix(matrix, debug=False):
LCS_matrix = create_np_array(matrix)
global_max = (0,0,0,0,0)
#start = time.time()
for i in range(len(LCS_matrix)):
# print("Completed search through all sub-sequences beginning with: ", i, global_max)
if len(LCS_matrix[i][0]) > 1:
temp = sort_matrix(LCS_matrix[i], global_max)
else:
temp = None
#end = time.time()
#print("finished sorting in: ", end-start)
if temp:
global_max = max_suffix(temp, global_max, debug)
#end = time.time()
#print("finished group: ", i, " in ", end-start)
return global_max
# I spent hours debugging a simple mistake... This is why testing is important. If you'd like the test, simply set debug=True. Plenty of debugging comments as well
test = [[1,3,3,4,5,6,7],
[3,1,4,2,7,3,4,5,6,7],
[1,5,6,7,6,7,4,2,7,3,4,3,1,4,2,7,3,4,5,6,7],
[1,4,8,7,8,9,10],
[3,6,1,4,2,7],
[9,8,4,0,3,4,5]]
start = time.time()
lcs_ = loop_matrix(encoded_arrays, False)
end = time.time()
my_runtime = end-start
print("The longest common substring(bytes) length is: ", lcs_[2])
print("The longest common substring(bytes) origin files are: ", lcs_[0:2])
print("Strings begin in positions: ", lcs_[3])
# Yup it works
print("Is final answer correct? ", lcs_[2]==bytes_max_length)
print("Is my version faster? ", my_runtime<cpython_runtime, my_runtime)