|
| 1 | +/** |
| 2 | + * @file |
| 3 | + * @brief |
| 4 | + * The [Boyer–Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm) algorithm searches for occurrences of pattern P in text T by |
| 5 | + * performing explicit character comparisons at different alignments. Instead of |
| 6 | + * a brute-force search of all alignments (of which there are n - m + 1), |
| 7 | + * Boyer–Moore uses information gained by preprocessing P to skip as many |
| 8 | + * alignments as possible. |
| 9 | + * |
| 10 | + * @details |
| 11 | + * The key insight in this algorithm is that if the end of the pattern is |
| 12 | + * compared to the text, then jumps along the text can be made rather than |
| 13 | + * checking every character of the text. The reason that this works is that in |
| 14 | + * lining up the pattern against the text, the last character of the pattern is |
| 15 | + * compared to the character in the text. |
| 16 | + * |
| 17 | + * If the characters do not match, there is no need to continue searching |
| 18 | + * backwards along the text. This leaves us with two cases. |
| 19 | + * |
| 20 | + * Case 1: |
| 21 | + * If the character in the text does not match any of the characters in the |
| 22 | + * pattern, then the next character in the text to check is located m characters |
| 23 | + * farther along the text, where m is the length of the pattern. |
| 24 | + * |
| 25 | + * Case 2: |
| 26 | + * If the character in the text is in the pattern, then a partial shift of the |
| 27 | + * pattern along the text is done to line up along the matching character and |
| 28 | + * the process is repeated. |
| 29 | + * |
| 30 | + * There are two shift rules: |
| 31 | + * |
| 32 | + * [The bad character rule] |
| 33 | + * (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_bad_character_rule) |
| 34 | + * |
| 35 | + * [The good suffix rule] |
| 36 | + * (https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm#The_good_suffix_rule) |
| 37 | + * |
| 38 | + * The shift rules are implemented as constant-time table lookups, using tables |
| 39 | + * generated during the preprocessing of P. |
| 40 | + * @author [Stoycho Kyosev](https://github.com/stoychoX) |
| 41 | + */ |
| 42 | + |
#include <algorithm>  /// for std::max
#include <cassert>    /// for assert
#include <climits>    /// for CHAR_MAX and UCHAR_MAX macros
#include <cstring>    /// for strlen
#include <iostream>   /// for IO operations
#include <string>     /// for std::string
#include <vector>     /// for std::vector
| 49 | + |
/// Number of symbols in the alphabet we use: one slot for every value an
/// `unsigned char` can take. `CHAR_MAX` alone (127 where `char` is signed)
/// would leave character 127 and every byte above it without a table entry.
#define APLHABET_SIZE (UCHAR_MAX + 1)
| 51 | + |
| 52 | +/** |
| 53 | + * @namespace strings |
| 54 | + * @brief String algorithms |
| 55 | + */ |
| 56 | +namespace strings { |
| 57 | +/** |
| 58 | + * @namespace boyer_moore |
| 59 | + * @brief Functions for the [Boyer |
| 60 | + * Moore](https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm) |
| 61 | + * algorithm implementation |
| 62 | + */ |
| 63 | +namespace boyer_moore { |
/**
 * @brief Bundle of a pattern together with the lookup tables that are
 * precomputed for it, i.e. everything needed to search for the pattern.
 */
struct pattern {
    /// The pattern text itself
    std::string pat;

    /// Bad char table used in the [Bad Character
    /// Heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-for-pattern-searching/)
    std::vector<size_t> bad_char;

    /// Good suffix table used for the [Good Suffix
    /// heuristic](https://www.geeksforgeeks.org/boyer-moore-algorithm-good-suffix-heuristic/?ref=rp)
    std::vector<size_t> good_suffix;
};
| 79 | + |
/**
 * @brief A function that preprocesses the good suffix table
 *
 * @details
 * After the call `arg` holds `str.size() + 1` shifts. `arg[j]` is the shift
 * for a mismatch at pattern index `j - 1` (the suffix `str[j..]` matched);
 * `arg[0]` is the shift to apply after a full match.
 *
 * The table is rebuilt from scratch on every call, so the same vector may be
 * reused for different patterns.
 *
 * @param str The string being preprocessed
 * @param arg The good suffix table (previous contents are discarded)
 * @returns void
 */
void init_good_suffix(const std::string& str, std::vector<size_t>& arg) {
    const size_t length = str.length();

    // Zero means "no shift assigned yet" in both passes below, so the table
    // must really start all-zero. `resize` would keep stale values when `arg`
    // already holds a table from an earlier call.
    arg.assign(length + 1, 0);

    // border_pos[i] - the start index of the widest border of the suffix
    // str[i..] (a border is a proper suffix that is also a prefix of it).
    std::vector<size_t> border_pos(length + 1, 0);

    size_t current_char = length;
    size_t border_index = length + 1;

    border_pos[current_char] = border_index;

    // First pass: the matched suffix occurs again somewhere inside the
    // pattern, preceded by a different character.
    while (current_char > 0) {
        while (border_index <= length &&
               str[current_char - 1] != str[border_index - 1]) {
            // Keep only the first (smallest) shift found for this position
            if (arg[border_index] == 0) {
                arg[border_index] = border_index - current_char;
            }

            border_index = border_pos[border_index];
        }

        --current_char;
        --border_index;
        border_pos[current_char] = border_index;
    }

    // Second pass: only a prefix of the pattern matches a suffix of the
    // matched part. Every still-empty slot gets the widest border that fits.
    size_t largest_border_index = border_pos[0];

    // Runs up to and including `length`, so the last slot is filled too.
    for (size_t i = 0; i <= length; i++) {
        if (arg[i] == 0) {
            arg[i] = largest_border_index;
        }

        // If we go past the largest border we find the next one as we iterate
        if (i == largest_border_index) {
            largest_border_index = border_pos[largest_border_index];
        }
    }
}
| 128 | + |
/**
 * @brief A function that preprocesses the bad char table
 *
 * @details
 * After the call `arg` has one entry for every `unsigned char` value. The
 * entry for a character is the distance from its last occurrence in `str` to
 * the end of `str`, or `str.length()` when the character does not occur.
 *
 * The table is rebuilt from scratch on every call, so the same vector may be
 * reused for different patterns.
 *
 * @param str The string being preprocessed
 * @param arg The bad char table (previous contents are discarded)
 * @returns void
 */
void init_bad_char(const std::string& str, std::vector<size_t>& arg) {
    // One slot per possible byte value, so that every character of the
    // pattern (including bytes >= 0x80) has a valid index.
    constexpr size_t table_size = static_cast<size_t>(UCHAR_MAX) + 1;
    const size_t length = str.length();

    // `assign` (unlike `resize`) also overwrites entries left by an earlier
    // call with a different pattern.
    arg.assign(table_size, length);

    for (size_t i = 0; i < length; i++) {
        // Plain `char` may be signed; go through `unsigned char` so that the
        // index can never be negative.
        arg[static_cast<unsigned char>(str[i])] = length - i - 1;
    }
}
| 143 | + |
| 144 | +/** |
| 145 | + * @brief A function that initializes pattern |
| 146 | + * |
| 147 | + * @param str Text used for initialization |
| 148 | + * @param arg Initialized structure |
| 149 | + * @returns void |
| 150 | + */ |
| 151 | +void init_pattern(const std::string& str, pattern& arg) { |
| 152 | + arg.pat = str; |
| 153 | + init_bad_char(str, arg.bad_char); |
| 154 | + init_good_suffix(str, arg.good_suffix); |
| 155 | +} |
| 156 | +/** |
| 157 | + * @brief A function that implements Boyer-Moore's algorithm. |
| 158 | + * |
| 159 | + * @param str Text we are seatching in. |
| 160 | + * @param arg pattern structure containing the preprocessed pattern |
| 161 | + * @return Vector of indexes of the occurrences of pattern in text |
| 162 | + */ |
| 163 | +std::vector<size_t> search(const std::string& str, const pattern& arg) { |
| 164 | + size_t index_position = arg.pat.size() - 1; |
| 165 | + std::vector<size_t> index_storage; |
| 166 | + |
| 167 | + while (index_position < str.length()) { |
| 168 | + size_t index_string = index_position; |
| 169 | + int index_pattern = static_cast<int>(arg.pat.size()) - 1; |
| 170 | + |
| 171 | + while (index_pattern >= 0 && |
| 172 | + str[index_string] == arg.pat[index_pattern]) { |
| 173 | + --index_pattern; |
| 174 | + --index_string; |
| 175 | + } |
| 176 | + |
| 177 | + if (index_pattern < 0) { |
| 178 | + index_storage.push_back(index_position - arg.pat.length() + 1); |
| 179 | + index_position += arg.good_suffix[0]; |
| 180 | + } else { |
| 181 | + index_position += std::max(arg.bad_char[str[index_string]], |
| 182 | + arg.good_suffix[index_pattern + 1]); |
| 183 | + } |
| 184 | + } |
| 185 | + |
| 186 | + return index_storage; |
| 187 | +} |
| 188 | + |
/**
 * @brief Tells whether the first `len` characters of pat open the string str.
 *
 * @param str pointer to some part of the input text (null-terminated).
 * @param pat the searched pattern.
 * @param len length of the searched pattern
 * @returns `true` if str starts with the first `len` characters of pat.
 * @returns `false` otherwise, including when str is shorter than `len`.
 */
bool is_prefix(const char* str, const char* pat, size_t len) {
    // A remaining text shorter than the pattern cannot start with it
    const bool long_enough = strlen(str) >= len;
    if (!long_enough) {
        return false;
    }

    // Walk forward while the two strings agree
    size_t same = 0;
    while (same < len && str[same] == pat[same]) {
        ++same;
    }

    return same == len;
}
| 211 | +} // namespace boyer_moore |
| 212 | +} // namespace strings |
| 213 | +/** |
| 214 | + * @brief A test case in which we search for every appearance of the word 'and' |
| 215 | + * @param text The text in which we search for appearance of the word 'and' |
| 216 | + * @returns void |
| 217 | + */ |
| 218 | +void and_test(const char* text) { |
| 219 | + strings::boyer_moore::pattern ands; |
| 220 | + strings::boyer_moore::init_pattern("and", ands); |
| 221 | + std::vector<size_t> indexes = strings::boyer_moore::search(text, ands); |
| 222 | + |
| 223 | + assert(indexes.size() == 2); |
| 224 | + assert(strings::boyer_moore::is_prefix(text + indexes[0], "and", 3)); |
| 225 | + assert(strings::boyer_moore::is_prefix(text + indexes[1], "and", 3)); |
| 226 | +} |
| 227 | + |
| 228 | +/** |
| 229 | + * @brief A test case in which we search for every appearance of the word 'pat' |
| 230 | + * @param text The text in which we search for appearance of the word 'pat' |
| 231 | + * @returns void |
| 232 | + */ |
| 233 | +void pat_test(const char* text) { |
| 234 | + strings::boyer_moore::pattern pat; |
| 235 | + strings::boyer_moore::init_pattern("pat", pat); |
| 236 | + std::vector<size_t> indexes = strings::boyer_moore::search(text, pat); |
| 237 | + |
| 238 | + assert(indexes.size() == 6); |
| 239 | + |
| 240 | + for (const auto& currentIndex : indexes) { |
| 241 | + assert(strings::boyer_moore::is_prefix(text + currentIndex, "pat", 3)); |
| 242 | + } |
| 243 | +} |
| 244 | +/** |
| 245 | + * @brief Self-test implementations |
| 246 | + * @returns void |
| 247 | + */ |
| 248 | +static void tests() { |
| 249 | + const char* text = |
| 250 | + "When pat Mr. and Mrs. pat Dursley woke up on the dull, gray \ |
| 251 | + Tuesday our story starts, \ |
| 252 | + there was nothing about pat the cloudy sky outside to pat suggest that\ |
| 253 | + strange and \ |
| 254 | + mysterious things would pat soon be happening all pat over the \ |
| 255 | + country."; |
| 256 | + |
| 257 | + and_test(text); |
| 258 | + pat_test(text); |
| 259 | + |
| 260 | + std::cout << "All tests have successfully passed!\n"; |
| 261 | +} |
| 262 | + |
| 263 | +/** |
| 264 | + * @brief Main function |
| 265 | + * @returns 0 on exit |
| 266 | + */ |
| 267 | +int main() { |
| 268 | + tests(); // run self-test implementations |
| 269 | + return 0; |
| 270 | +} |
0 commit comments