Skip to content

Commit 53c26c4

Browse files
initial commit
1 parent bd33c37 commit 53c26c4

File tree

6 files changed

+1069
-0
lines changed

6 files changed

+1069
-0
lines changed

Diff for: StringSearching.ipynb

+243
Large diffs are not rendered by default.

Diff for: algorithm/base.py

+17
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,17 @@
1+
import abc as ABC
2+
3+
from abc import ABC, abstractmethod, abstractproperty
4+
5+
class Algorithm(ABC):
    """Abstract interface shared by the string-search algorithms.

    A concrete algorithm is given the pattern through ``set_candidate``,
    reports match offsets from ``search`` and exposes a display ``name``.
    The signatures below mirror how the concrete classes are called.
    """

    @abstractmethod
    def set_candidate(self, candidate, **params):
        """Register ``candidate`` as the pattern to look for.

        ``params`` carries optional, algorithm-specific preprocessing
        settings; implementations are free to ignore it.
        """

    @abstractmethod
    def search(self, multiple_search=False):
        """Run the search and return the offsets of the matches found.

        ``multiple_search`` selects between collecting every match (True)
        and stopping after the first one (False).
        """

    # ``abstractproperty`` has been deprecated since Python 3.3; stacking
    # ``property`` on top of ``abstractmethod`` is the supported spelling.
    @property
    @abstractmethod
    def name(self):
        """Human-readable name of the algorithm."""

Diff for: algorithm/naive.py

+35
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
from algorithm.base import Algorithm
2+
3+
4+
class BruteForce(Algorithm):
    """Naive string search: try the candidate at every offset of the reference."""

    def __init__(self, reference):
        # Sequence that is searched in.
        self.reference = reference

    @property
    def name(self):
        """Display name of the algorithm."""
        return 'Brute Force'

    def set_candidate(self, candidate, **params):
        """Store the pattern to search for.

        No preprocessing is performed; ``params`` is accepted and ignored.
        """
        self.candidate = candidate

    def search(self, multiple_search=False) -> list:
        """Return the offsets at which the candidate occurs in the reference.

        Args:
            multiple_search: when true, collect every occurrence (overlapping
                ones included); otherwise stop after the first one.

        Returns:
            A list of offsets in ascending order. It is empty when nothing
            matches, when the candidate is empty, or when the candidate is
            longer than the reference.
        """
        reference = self.reference
        candidate = self.candidate
        len_candidate = len(candidate)
        offset_lst = []

        # An empty pattern has no first symbol to compare; report no matches
        # instead of failing with IndexError.
        if len_candidate == 0:
            return offset_lst

        # The last valid start is len(reference) - len(candidate), hence the
        # "+ 1"; without it a match at the very end of the reference is missed.
        for offset in range(len(reference) - len_candidate + 1):
            # all() stops at the first mismatching symbol.
            if all(reference[offset + i] == candidate[i]
                   for i in range(len_candidate)):
                offset_lst.append(offset)
                if not multiple_search:
                    break
        return offset_lst

Diff for: algorithm/rabin_karp.py

+41
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,41 @@
1+
import numpy as np
2+
from algorithm.base import Algorithm
3+
4+
5+
class RabinKarp(Algorithm):
    """Rabin-Karp style search: compare hashes first, symbols second.

    Every window of the reference is hashed with ``hash_function`` and only
    windows whose hash equals the candidate's hash are compared symbol by
    symbol. Each window is hashed from scratch (no rolling hash).
    """

    def __init__(self, reference, hash_function=hash):
        # Sequence that is searched in.
        self.reference = reference
        # Callable applied to both the candidate and every reference window.
        self.hash_function = hash_function

    @property
    def name(self):
        """Display name of the algorithm."""
        return 'Rabin-Karp'

    def set_candidate(self, candidate, **params):
        """Store the pattern to search for; ``params`` is accepted and ignored."""
        self.candidate = candidate

    def search(self, multiple_search=False) -> list:
        """Return the offsets at which the candidate occurs in the reference.

        Args:
            multiple_search: when true, collect every occurrence (overlapping
                ones included); otherwise stop after the first one.

        Returns:
            A list of offsets in ascending order. It is empty when nothing
            matches, when the candidate is empty, or when the candidate is
            longer than the reference.
        """
        reference = self.reference
        candidate = self.candidate
        len_candidate = len(candidate)
        offset_lst = []

        # Guard the empty pattern explicitly; the previous loop bound divided
        # by the candidate length and failed with ZeroDivisionError.
        if len_candidate == 0:
            return offset_lst

        # Use the configured hash for BOTH sides, otherwise a custom
        # hash_function could never produce a match.
        hash_function = self.hash_function
        candidate_hash = hash_function(candidate)

        # Every start position up to len(reference) - len(candidate) has to
        # be visited, not just the first ceil(n / m) of them.
        for offset in range(len(reference) - len_candidate + 1):
            window = reference[offset:offset + len_candidate]
            if hash_function(window) != candidate_hash:
                continue
            # Equal hashes may be a collision, so confirm symbol by symbol.
            if all(window[i] == candidate[i] for i in range(len_candidate)):
                offset_lst.append(offset)
                if not multiple_search:
                    break
        return offset_lst

Diff for: utils/tools.py

+98
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,98 @@
1+
import random
2+
import numpy as np
3+
import pandas as pd
4+
import matplotlib.pyplot as plt
5+
from datetime import datetime
6+
7+
from algorithm.base import Algorithm
8+
from algorithm.naive import BruteForce
9+
from algorithm.rabin_karp import RabinKarp
10+
11+
12+
def gen_random_string(dictionary, length):
    """Build a random string of ``length`` symbols taken from ``dictionary``.

    Symbols are drawn with ``random.choices`` (i.e. with replacement) and
    concatenated without a separator.
    """
    symbols = random.choices(dictionary, k=length)
    return ''.join(symbols)
14+
15+
16+
def gen_string_from_string(str, length=None, **params):
    """Return ``str`` cut down to its first ``length`` symbols.

    Args:
        str: source sequence (the name shadows the builtin but is kept so
            existing keyword callers keep working).
        length: number of leading symbols to keep; ``None`` means "return
            the input unchanged".
        **params: accepted and ignored.

    Returns:
        ``str`` itself when ``length`` is ``None``, otherwise ``str[:length]``.
    """
    # Compare against None explicitly: a plain truthiness test would treat
    # length=0 as "no limit" and hand back the whole input.
    if length is None:
        return str
    return str[:length]
21+
22+
23+
def generate_stat(algorithms,
                  set_params,
                  gen_string,
                  dictionary, reference_len, candidate_len,
                  n_observations,
                  **params):
    """Time each algorithm on generated inputs and collect the measurements.

    For every ``(reference_len, candidate_len)`` pair, ``n_observations``
    reference/candidate pairs are produced with ``gen_string`` and every
    algorithm is run on each of them with ``multiple_search=True``.

    Args:
        algorithms: callables that take a reference and return an
            ``Algorithm`` instance.
        set_params: one dict of keyword arguments per algorithm, forwarded
            to its ``set_candidate``.
        gen_string: callable ``(dictionary, length)`` returning a string.
        dictionary: first argument passed to ``gen_string``.
        reference_len: sequence of reference lengths.
        candidate_len: sequence of candidate lengths, paired with
            ``reference_len`` element by element.
        n_observations: number of generated inputs per length pair.
        **params: accepted and ignored.

    Returns:
        A ``pandas.DataFrame`` with one row per (length pair, observation,
        algorithm) and the columns ``algorithm``, ``reference_len``,
        ``candidate_len``, ``preprocessing``, ``execution`` (both in
        seconds), ``observation`` and ``indexes`` (``str`` of the offsets).

    Raises:
        ValueError: if the paired argument sequences differ in length.
        TypeError: if an entry of ``algorithms`` does not build an
            ``Algorithm``.
    """
    # perf_counter is monotonic and high-resolution, unlike wall-clock
    # datetime.now(), which may jump while a measurement is running.
    from time import perf_counter

    # Explicit checks instead of assert, which is stripped under ``-O``.
    if len(reference_len) != len(candidate_len):
        raise ValueError(
            'reference_len and candidate_len must have the same length')
    if len(algorithms) != len(set_params):
        raise ValueError(
            'algorithms and set_params must have the same length')
    for algorithm in algorithms:
        if not isinstance(algorithm(''), Algorithm):
            raise TypeError(
                'every entry of algorithms must build an Algorithm instance')

    info_dct = {
        'algorithm': [],
        'reference_len': [],
        'candidate_len': [],
        'preprocessing': [],
        'execution': [],
        'observation': [],
        'indexes': [],
    }

    for refer_len, candid_len in zip(reference_len, candidate_len):
        for observation in range(n_observations):
            # Every algorithm sees the same generated pair.
            reference = gen_string(dictionary, refer_len)
            candidate = gen_string(dictionary, candid_len)

            # ``candidate_params`` avoids shadowing the ``**params`` argument.
            for algorithm, candidate_params in zip(algorithms, set_params):
                started = perf_counter()
                alg = algorithm(reference)
                alg.set_candidate(candidate, **candidate_params)
                preprocess = perf_counter() - started

                started = perf_counter()
                indexes = alg.search(multiple_search=True)
                execution = perf_counter() - started

                info_dct['algorithm'].append(alg.name)
                info_dct['reference_len'].append(refer_len)
                info_dct['candidate_len'].append(candid_len)
                info_dct['preprocessing'].append(preprocess)
                info_dct['execution'].append(execution)
                info_dct['observation'].append(observation)
                info_dct['indexes'].append(str(indexes))

    return pd.DataFrame.from_dict(info_dct)
73+
74+
75+
def get_plots(stat_df,
              figsize=(14, 6),
              title='Execution time of algorithms'):
    """Plot mean execution time against reference length, one line per algorithm.

    Args:
        stat_df: DataFrame with the columns ``algorithm``, ``reference_len``,
            ``execution mean`` and ``execution std``.
        figsize: size passed to ``plt.figure``.
        title: title of the plot.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=figsize)

    for alg in stat_df.algorithm.unique():
        # Filter once per algorithm instead of once per column.
        alg_df = stat_df[stat_df.algorithm == alg]
        ox = alg_df['reference_len']
        oy = alg_df['execution mean']
        oy_std = alg_df['execution std']

        p = plt.plot(ox, oy, '.-', label=alg)
        # The shaded band spans three standard deviations around the mean,
        # which is not a 95% interval, so the label states what is drawn.
        plt.fill_between(ox,
                         oy - 3 * oy_std,
                         oy + 3 * oy_std,
                         color=p[0].get_color(), alpha=0.3,
                         label='Mean +/- 3 std ' + alg)

    plt.title(title)
    plt.xlabel('Reference string length')
    plt.ylabel('Time, seconds')
    plt.legend()
    plt.grid()
    plt.show()

0 commit comments

Comments
 (0)