initial commit

VirtualRoyalty · VirtualRoyalty · commit 53c26c4d503e · 2020-04-12T00:43:44.000+03:00
diff --git a/StringSearching.ipynb b/StringSearching.ipynb
diff --git a/algorithm/base.py b/algorithm/base.py
@@ -0,0 +1,17 @@
+import abc as ABC
+
+from abc import ABC, abstractmethod, abstractproperty
+
+class Algorithm(ABC):
+    """Abstract interface shared by the string-search algorithms.
+
+    Implementations in this package are constructed with the reference
+    text, receive the pattern through ``set_candidate`` and report match
+    offsets from ``search``.
+    """
+
+    @abstractmethod
+    def set_candidate(self, candidate, **params):
+        """Store the pattern to look for and run any preprocessing.
+
+        Args:
+            candidate: The pattern that ``search`` will look for.
+            **params: Algorithm-specific preprocessing options.
+        """
+
+    @abstractmethod
+    def search(self, multiple_search=False) -> list:
+        """Return the offsets at which the candidate occurs.
+
+        Args:
+            multiple_search: When false, stop after the first match;
+                when true, collect every match.
+
+        Returns:
+            A list of integer offsets (empty when nothing matches).
+        """
+
+    # ``abstractproperty`` is deprecated; stacking ``property`` on top of
+    # ``abstractmethod`` is the supported spelling and is enforced the
+    # same way at instantiation time.
+    @property
+    @abstractmethod
+    def name(self):
+        """Human-readable name of the algorithm."""
diff --git a/algorithm/naive.py b/algorithm/naive.py
@@ -0,0 +1,35 @@
+from algorithm.base import Algorithm
+
+
+class BruteForce(Algorithm):
+    """Naive string search: test every alignment character by character.
+
+    The reference text is fixed at construction time; the pattern is
+    supplied afterwards through ``set_candidate``.
+    """
+
+    def __init__(self, reference):
+        self.reference = reference
+
+    @property
+    def name(self):
+        """Display name of the algorithm."""
+        return 'Brute Force'
+
+    def set_candidate(self, candidate, **params):
+        """Remember the pattern to search for.
+
+        The naive algorithm needs no preprocessing, so ``params`` is
+        accepted only for interface compatibility and is ignored.
+        """
+        self.candidate = candidate
+
+    def search(self, multiple_search=False) -> list:
+        """Return the offsets in ``reference`` where ``candidate`` occurs.
+
+        Args:
+            multiple_search: When false, return as soon as the first match
+                is found; when true, collect every (possibly overlapping)
+                match.
+
+        Returns:
+            A list of offsets in increasing order. It is empty when there
+            is no match, when the candidate is longer than the reference,
+            or when the candidate is empty.
+        """
+        reference = self.reference
+        candidate = self.candidate
+        len_candidate = len(candidate)
+        offset_lst = []
+
+        if len_candidate == 0:
+            # Nothing to compare; avoids indexing candidate[0].
+            return offset_lst
+
+        # "+ 1" so the final alignment (candidate flush with the end of the
+        # reference) is tested as well. The range is empty when the
+        # candidate is longer than the reference.
+        for offset in range(len(reference) - len_candidate + 1):
+            for i in range(len_candidate):
+                if reference[offset + i] != candidate[i]:
+                    break
+            else:
+                # Inner loop finished without a mismatch: full match.
+                offset_lst.append(offset)
+                if not multiple_search:
+                    break
+        return offset_lst
diff --git a/algorithm/rabin_karp.py b/algorithm/rabin_karp.py
@@ -0,0 +1,41 @@
+import numpy as np
+from algorithm.base import Algorithm
+
+
+class RabinKarp(Algorithm):
+    """Hash-filtered string search in the style of Rabin-Karp.
+
+    Every window of the reference is hashed with ``hash_function`` and
+    compared with the hash of the candidate; windows whose hashes agree
+    are then verified by direct comparison.
+
+    NOTE: each window is hashed from scratch (no rolling hash), so the
+    hashing cost per window grows with the candidate length.
+    """
+
+    def __init__(self, reference, hash_function=hash):
+        """Store the reference text and the hash used to filter windows.
+
+        Args:
+            reference: The text to search in.
+            hash_function: Callable applied to the candidate and to every
+                reference window; defaults to the builtin ``hash``.
+        """
+        self.reference = reference
+        self.hash_function = hash_function
+
+    @property
+    def name(self):
+        """Display name of the algorithm."""
+        return 'Rabin-Karp'
+
+    def set_candidate(self, candidate, **params):
+        """Remember the pattern to search for; ``params`` is ignored."""
+        self.candidate = candidate
+
+    def search(self, multiple_search=False) -> list:
+        """Return the offsets in ``reference`` where ``candidate`` occurs.
+
+        Args:
+            multiple_search: When false, return as soon as the first match
+                is found; when true, collect every (possibly overlapping)
+                match.
+
+        Returns:
+            A list of offsets in increasing order. It is empty when there
+            is no match, when the candidate is longer than the reference,
+            or when the candidate is empty.
+        """
+        reference = self.reference
+        candidate = self.candidate
+        len_candidate = len(candidate)
+        offset_lst = []
+
+        if len_candidate == 0:
+            # Nothing to search for; also avoids hashing empty windows.
+            return offset_lst
+
+        # The same function must hash both sides, otherwise a custom
+        # ``hash_function`` can never agree with the window hashes.
+        hash_function = self.hash_function
+        candidate_hash = hash_function(candidate)
+
+        # Visit every alignment, including the one flush with the end of
+        # the reference.
+        for offset in range(len(reference) - len_candidate + 1):
+            window = reference[offset:offset + len_candidate]
+            if hash_function(window) != candidate_hash:
+                continue
+            # Equal hashes may be a collision; confirm the actual content.
+            if window == candidate:
+                offset_lst.append(offset)
+                if not multiple_search:
+                    break
+        return offset_lst
diff --git a/utils/tools.py b/utils/tools.py
@@ -0,0 +1,98 @@
+import random
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from datetime import datetime
+
+from algorithm.base import Algorithm
+from algorithm.naive import BruteForce
+from algorithm.rabin_karp import RabinKarp
+
+
+def gen_random_string(dictionary, length):
+    """Build a string of ``length`` symbols drawn from ``dictionary``.
+
+    Symbols are sampled with replacement via ``random.choices`` and
+    concatenated in the order drawn.
+    """
+    symbols = random.choices(dictionary, k=length)
+    return ''.join(symbols)
+
+
+def gen_string_from_string(str, length=None, **params):
+    """Return the first ``length`` characters of ``str``.
+
+    Args:
+        str: Source string. The name shadows the builtin but is kept so
+            existing keyword callers continue to work.
+        length: Number of leading characters to keep. ``None`` (the
+            default) returns the whole string; ``0`` returns ``''``.
+        **params: Accepted and ignored.
+
+    Returns:
+        The requested prefix of ``str``.
+    """
+    # Compare against None explicitly: a plain truthiness test would treat
+    # a requested length of 0 as "no limit" and return the whole string.
+    if length is None:
+        return str
+    return str[:length]
+
+
+def generate_stat(algorithms,
+                  set_params,
+                  gen_string,
+                  dictionary, reference_len, candidate_len,
+                  n_observations,
+                  **params):
+    """Time search algorithms on generated reference/candidate strings.
+
+    For every ``(reference_len[i], candidate_len[i])`` pair and every
+    observation, one reference and one candidate are generated and each
+    algorithm is timed on that same pair.
+
+    Args:
+        algorithms: Callables that take the reference string and return an
+            ``Algorithm`` instance (each is also called once with ``''``
+            during validation).
+        set_params: One keyword-argument mapping per algorithm, forwarded
+            to its ``set_candidate``.
+        gen_string: Callable ``(dictionary, length) -> str`` producing the
+            test strings.
+        dictionary: First argument handed to ``gen_string``.
+        reference_len: Sequence of reference lengths.
+        candidate_len: Sequence of candidate lengths, paired by position
+            with ``reference_len``.
+        n_observations: Number of string pairs generated per length pair.
+        **params: Accepted and ignored.
+
+    Returns:
+        A ``pandas.DataFrame`` with one row per (length pair, observation,
+        algorithm) and the columns ``algorithm``, ``reference_len``,
+        ``candidate_len``, ``preprocessing`` and ``execution`` (seconds,
+        float), ``observation`` and ``indexes`` (``str`` of the offsets).
+
+    Raises:
+        AssertionError: If the argument sequences have mismatched lengths
+            or an entry of ``algorithms`` does not build an ``Algorithm``.
+    """
+    # Imported locally: a monotonic, high-resolution clock is the right
+    # tool for measuring short intervals, unlike wall-clock datetimes.
+    from time import perf_counter
+
+    # sanity checks
+    assert len(reference_len) == len(candidate_len), \
+        'reference_len and candidate_len must have the same length'
+    assert len(algorithms) == len(set_params), \
+        'set_params must contain one entry per algorithm'
+    for algorithm in algorithms:
+        assert isinstance(algorithm(''), Algorithm), \
+            'every entry of algorithms must build an Algorithm'
+
+    info_dct = {
+        'algorithm':     [],
+        'reference_len': [],
+        'candidate_len': [],
+        'preprocessing': [],
+        'execution':     [],
+        'observation':   [],
+        'indexes':       [],
+    }
+
+    for refer_len, candid_len in zip(reference_len, candidate_len):
+
+        for observation in range(n_observations):
+
+            # All algorithms are measured on the same generated pair.
+            reference = gen_string(dictionary, refer_len)
+            candidate = gen_string(dictionary, candid_len)
+
+            # A dedicated loop name keeps the function's own ``**params``
+            # from being rebound on every iteration.
+            for algorithm, candidate_params in zip(algorithms, set_params):
+
+                started = perf_counter()
+                alg = algorithm(reference)
+                alg.set_candidate(candidate, **candidate_params)
+                preprocess = perf_counter() - started
+
+                started = perf_counter()
+                indexes = alg.search(multiple_search=True)
+                execution = perf_counter() - started
+
+                info_dct['algorithm'].append(alg.name)
+                info_dct['reference_len'].append(refer_len)
+                info_dct['candidate_len'].append(candid_len)
+                info_dct['preprocessing'].append(preprocess)
+                info_dct['execution'].append(execution)
+                info_dct['observation'].append(observation)
+                info_dct['indexes'].append(str(indexes))
+
+    return pd.DataFrame.from_dict(info_dct)
+
+
+def get_plots(stat_df,
+              figsize=(14, 6),
+              title='Execution time of algorithms',
+              n_std=3):
+    """Plot mean execution time per algorithm with a spread band.
+
+    Args:
+        stat_df: Aggregated statistics with the columns ``algorithm``,
+            ``reference_len``, ``execution mean`` and ``execution std``.
+        figsize: Figure size passed to ``plt.figure``.
+        title: Plot title.
+        n_std: Half-width of the shaded band, in standard deviations
+            around the mean. The default of 3 matches the band that was
+            previously hard-coded.
+
+    The figure is displayed with ``plt.show()``; nothing is returned.
+    """
+    plt.figure(figsize=figsize)
+
+    for alg in stat_df.algorithm.unique():
+
+        # Select this algorithm's rows once instead of once per column.
+        alg_rows = stat_df[stat_df.algorithm == alg]
+        ox = alg_rows['reference_len']
+        oy = alg_rows['execution mean']
+        oy_std = alg_rows['execution std']
+
+        p = plt.plot(ox, oy, '.-', label=alg)
+        # The legend states what is actually drawn (mean +/- n_std std);
+        # a +/- 3 std band is not a 95% confidence interval.
+        plt.fill_between(ox,
+                         oy - n_std * oy_std,
+                         oy + n_std * oy_std,
+                         color=p[0].get_color(), alpha=0.3,
+                         label=f'Mean +/- {n_std} std, {alg}')
+    plt.title(title)
+    plt.xlabel('Reference string length')
+    plt.ylabel('Time, seconds')
+    plt.legend()
+    plt.grid()
+    plt.show()
diff --git a/Все идет по плану.txt b/Все идет по плану.txt