Skip to content

Commit 55fd101

Browse files
report added, img added, minor fixes
2 parents c38ad45 + 7efa6e0 commit 55fd101

9 files changed

+355
-142
lines changed

StringSearching.ipynb

+288-94
Large diffs are not rendered by default.

algorithm/boyer_moore_horspool.py

+18-13
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,8 @@ class BoyerMooreHorspool(Algorithm):
88
def __init__(self, reference):
99
# reference = reference.translate(reference.maketrans('', '', ascii_letters))
1010
self.reference = reference
11+
self.n_operations = 0
12+
1113

1214
@property
1315
def name(self):
@@ -17,30 +19,33 @@ def set_candidate(self, candidate, **params):
1719
self.candidate = candidate
1820

1921
def set_skip_table(self):
20-
len_reference = len(self.reference)
21-
len_candidate = len(self.candidate)
22-
if len_candidate > len_reference:
23-
print("Error: len candidate > len references {} > {}.".format(len_candidate, len_reference))
22+
len_ref = len(self.reference)
23+
len_can = len(self.candidate)
24+
if len_can > len_ref:
25+
print("Error: len candidate > len references {} > {}." \
26+
.format(len_can, len_ref))
2427
return -1
2528

26-
self.table_skip = defaultdict(lambda: len_candidate)
29+
self.table_skip = defaultdict(lambda: len_can)
30+
31+
for offset in range(len_can - 1):
32+
self.table_skip[ord(self.candidate[offset])] = len_can - offset - 1
2733

28-
for offset in range(len_candidate - 1):
29-
self.table_skip[ord(self.candidate[offset])] = len_candidate - offset - 1
30-
3134
def search(self, multiple_search=False) -> list:
3235
self.set_skip_table()
3336

34-
len_reference = len(self.reference)
35-
len_candidate = len(self.candidate)
37+
len_ref = len(self.reference)
38+
len_can = len(self.candidate)
3639

3740
offset_lst = []
38-
offset = len_candidate - 1
41+
offset = len_can - 1
3942

40-
while offset < len_reference:
41-
j = len_candidate - 1
43+
while offset < len_ref:
44+
j = len_can - 1
4245
i = offset
46+
self.n_operations += 1
4347
while j >= 0 and self.reference[i] == self.candidate[j]:
48+
self.n_operations += 1
4449
j -= 1
4550
i -= 1
4651
if j == -1:

algorithm/knuth_morris_pratt.py

+27-26
Original file line number | Diff line number | Diff line change
@@ -8,73 +8,74 @@ class KnuthMorrisPratt(Algorithm):
88
def __init__(self, reference):
99
# reference = reference.translate(reference.maketrans('', '', ascii_letters))
1010
self.reference = reference
11+
self.n_operations = 0
12+
1113

1214
@property
1315
def name(self):
1416
return 'Knuth-Morris-Pratt'
15-
17+
1618
def set_candidate(self, candidate, **params):
1719
self.candidate = candidate
1820

1921
def search(self, multiple_search=False) -> list:
2022
"""Return the lowest index of T at which substring P begins (or else -1)."""
21-
len_reference = len(self.reference)
22-
len_candidate = len(self.candidate)
23-
if len_candidate > len_reference:
24-
print("Error: len candidate > len references {} > {}.".format(len_candidate, len_reference))
23+
len_ref = len(self.reference)
24+
len_can = len(self.candidate)
25+
26+
if len_can > len_ref:
27+
print("Error: len candidate > len references {} > {}.".format(len_can, len_ref))
2528
return -1
26-
# create lps[] that will hold the longest prefix suffix
29+
# create lps[] that will hold the longest prefix suffix
2730
# values for pattern
28-
self.lps = [0] * len_candidate
31+
self.lps = [0] * len_can
2932
j = 0 # index for pat[]
3033

31-
# Preprocess the pattern (calculate lps[] array)
34+
# Preprocess the pattern
3235
self.computeLPSArray()
33-
3436
offset_lst = []
35-
37+
3638
i = 0 # index for txt[]
37-
while i < len_reference:
39+
while i < len_ref:
40+
41+
self.n_operations += 1
3842
if self.candidate[j] == self.reference[i]:
3943
i += 1
4044
j += 1
4145

42-
if j == len_candidate:
46+
47+
if j == len_can:
48+
self.n_operations += 1
4349
offset_lst.append(i - j)
4450
j = self.lps[j - 1]
45-
if (not multiple_search):
51+
if (not multiple_search):
4652
return offset_lst
4753
# mismatch after j matches
48-
elif i < len_reference and self.candidate[j] != self.reference[i]:
49-
# Do not match lps[0..lps[j-1]] characters,
50-
# they will match anyway
54+
elif i < len_ref and self.candidate[j] != self.reference[i]:
55+
self.n_operations += 1
5156
if j != 0:
5257
j = self.lps[j - 1]
5358
else:
5459
i += 1
5560
return offset_lst
56-
61+
5762
def computeLPSArray(self):
58-
len_candidate = len(self.candidate)
63+
len_can = len(self.candidate)
5964

60-
length = 0 # length of the previous longest prefix suffix
65+
length = 0
6166

62-
self.lps[0] = 0 # lps[0] is always 0
67+
self.lps[0] = 0
6368
i = 1
6469

65-
# the loop calculates lps[i] for i = 1 to M-1
66-
while i < len_candidate:
70+
while i < len_can:
71+
self.n_operations += 1
6772
if self.candidate[i] == self.candidate[length]:
6873
length += 1
6974
self.lps[i] = length
7075
i += 1
7176
else:
7277
if length != 0:
73-
# This is tricky. Consier the example AAACAAAA
74-
# and i = 7
7578
length = self.lps[length - 1]
76-
77-
# Also, note that we do not increment i here
7879
else:
7980
self.lps[i] = 0
8081
i += 1

algorithm/naive.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@ def __init__(self, reference):
88
self.reference = reference
99
self.n_operations = 0
1010

11+
1112
@property
1213
def name(self):
1314
return 'Brute Force'
@@ -21,13 +22,13 @@ def set_candidate(self, candidate, **params):
2122

2223
def search(self, multiple_search=False) -> list:
2324
offset_lst = []
24-
len_reference = len(self.reference)
25-
len_candidate = len(self.candidate)
26-
for offset in range(len_reference - len_candidate):
25+
len_ref = len(self.reference)
26+
len_can = len(self.candidate)
27+
for offset in range(len_ref -len_can + 1):
2728
i = 0
2829
while self.reference[offset + i] == self.candidate[i]:
2930
self.n_operations += 1
30-
if (i + 1) == len_candidate:
31+
if (i + 1) == len_can:
3132
offset_lst.append(offset)
3233
if not multiple_search:
3334
return offset_lst

algorithm/rabin_karp.py

+8-2
Original file line number | Diff line number | Diff line change
@@ -7,9 +7,11 @@ class RabinKarp(Algorithm):
77

88
def __init__(self, reference, hash_function=hash):
99
self.reference = reference
10+
self.n_operations = 0
1011
self.hash_function = hash_function
1112
self.n_operations = 0
1213

14+
1315
@property
1416
def name(self):
1517
return 'Rabin-Karp'
@@ -41,23 +43,26 @@ def search(self, multiple_search=False)-> list:
4143

4244
for i in range(len_ref-len_can+1):
4345

46+
self.n_operations += 1
47+
4448
if self.c == self.r:
4549
match = True
4650
for j in range(len_can):
51+
self.n_operations += 1
4752
if self.candidate[j] != self.reference[i+j]:
4853
match = False
4954
break
5055
if match:
5156
offset_lst.append(i)
5257
if not multiple_search:
53-
return offset
58+
return [i]
5459

5560
if i < len_ref - len_can:
5661
self.r = (self.r - self.h * ord(self.reference[i]))
5762
self.r %= self.q
5863
self.r = (self.r * self.d + ord(self.reference[i+len_can]))
5964
self.r %= self.q
60-
self.r = (self.r + self.q) % self.q
65+
# self.r = (self.r + self.q) % self.q
6166

6267
return offset_lst
6368

@@ -72,6 +77,7 @@ def search_pyhash(self, multiple_search=False) -> list:
7277
for offset in range(int(np.ceil(len_ref / len_can))):
7378
reference_hash = hash(self.reference[offset:(offset + len_can)])
7479
if reference_hash == candidate_hash:
80+
self.n_operations += 1
7581
i = 0
7682
while self.reference[offset + i] == self.candidate[i]:
7783
self.n_operations += 1

img/bad.png

93.4 KB
Loading

img/good.png

146 KB
Loading

report.pdf

320 KB
Binary file not shown.

utils/tools.py

+9-3
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ def generate_stat(algorithms,
2626
gen_string,
2727
dictionary, reference_len, candidate_len,
2828
n_observations,
29+
multiple_search=True,
2930
**params):
3031

3132
# sanity checks
@@ -41,6 +42,7 @@ def generate_stat(algorithms,
4142
'preprocessing': [],
4243
'execution': [],
4344
'observation': [],
45+
'n_operations': [],
4446
'indexes': []
4547
}
4648

@@ -59,7 +61,7 @@ def generate_stat(algorithms,
5961
preprocess = datetime.now() - start_time
6062

6163
start_time = datetime.now()
62-
indexes = alg.search(multiple_search=True)
64+
indexes = alg.search(multiple_search=multiple_search)
6365
execution = datetime.now() - start_time
6466

6567
info_dct['algorithm'] += [alg.name]
@@ -68,6 +70,7 @@ def generate_stat(algorithms,
6870
info_dct['preprocessing'] += [preprocess.total_seconds()]
6971
info_dct['execution'] += [execution.total_seconds()]
7072
info_dct['observation'] += [observation]
73+
info_dct['n_operations'] += [alg.n_operations]
7174
info_dct['indexes'] += [str(indexes)]
7275

7376
return pd.DataFrame.from_dict(info_dct)
@@ -79,6 +82,7 @@ def generate_stat_for_benchmarks(algorithms,
7982
files_t,
8083
path_to_benchmarks,
8184
n_observations=1,
85+
multiple_search=True,
8286
**params):
8387

8488
assert len(files_w) == len(files_t)
@@ -94,6 +98,7 @@ def generate_stat_for_benchmarks(algorithms,
9498
'preprocessing': [],
9599
'execution': [],
96100
'observation': [],
101+
'n_operations': [],
97102
'indexes': []
98103
}
99104
for observation in range(n_observations):
@@ -117,7 +122,7 @@ def generate_stat_for_benchmarks(algorithms,
117122
preprocess = datetime.now() - start_time
118123

119124
start_time = datetime.now()
120-
indexes = alg.search(multiple_search=True)
125+
indexes = alg.search(multiple_search=multiple_search)
121126
execution = datetime.now() - start_time
122127

123128
info_dct['algorithm'] += [alg.name]
@@ -127,6 +132,7 @@ def generate_stat_for_benchmarks(algorithms,
127132
info_dct['preprocessing'] += [preprocess.total_seconds()]
128133
info_dct['execution'] += [execution.total_seconds()]
129134
info_dct['observation'] += [observation]
135+
info_dct['n_operations'] += [alg.n_operations]
130136
info_dct['indexes'] += [str(indexes)]
131137
return pd.DataFrame.from_dict(info_dct)
132138

@@ -148,7 +154,7 @@ def get_plots(stat_df,
148154
oy - 2 * oy_std,
149155
oy + 2 * oy_std,
150156
color=p[0].get_color(), alpha=0.3,
151-
label='Confidence interval of 68% ' + alg)
157+
label='Confidence interval of 68% '+alg)
152158
plt.title(title)
153159
plt.xlabel('Reference string length')
154160
plt.ylabel('Time, seconds ')

0 commit comments

Comments
 (0)