|
| 1 | +--- |
| 2 | +id: aho-corasick-algorithm |
| 3 | +title: Aho-Corasick Algorithm for Efficient String Matching |
| 4 | +sidebar_label: 0011 - Aho-Corasick Algorithm |
| 5 | +tags: [Aho-Corasick, String Matching, Algorithm, C++, Problem Solving] |
| 6 | +description: This is a solution for implementing the Aho-Corasick Algorithm to search multiple patterns simultaneously in a given text. |
| 7 | +--- |
| 8 | + |
| 9 | +## Problem Statement |
| 10 | + |
| 11 | +### Problem Description |
| 12 | + |
| 13 | +The Aho-Corasick Algorithm searches a text for many patterns simultaneously. It builds a finite state machine from a trie (digital tree) of the patterns and adds failure links between nodes, so that after a mismatch the search continues from the longest suffix that is still a prefix of some pattern instead of restarting from the beginning. |
| 14 | + |
| 15 | +### Examples |
| 16 | + |
| 17 | +**Example 1:** |
| 18 | + |
| 19 | +```plaintext |
| 20 | +Input: |
| 21 | +Patterns: {"he", "she", "his", "hers"} |
| 22 | +Text: "ahishers" |
| 23 | +Output: |
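| 24 | +Pattern found: his at index 1 |
| 25 | +Pattern found: she at index 3 |
| 26 | +Pattern found: he at index 4 |
| 27 | +Pattern found: hers at index 4 |
| 28 | +
|
| 29 | +Explanation: All four patterns are found in a single left-to-right pass over the text. Each index is the 0-based position where the match starts, and matches are listed in the order in which they end. |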
| 30 | +``` |
| 31 | + |
| 32 | +### Constraints |
| 33 | + |
| 34 | +- The input consists of multiple patterns and a single text. |
| 35 | +- The algorithm should handle large patterns and text sizes efficiently. |
| 36 | + |
| 37 | +## Solution of Given Problem |
| 38 | + |
| 39 | +### Intuition and Approach |
| 40 | + |
| 41 | +The Aho-Corasick Algorithm follows these steps: |
| 42 | + |
| 43 | +1. Build a Trie: Insert all patterns into a trie. |
| 44 | +2. Construct Failure Links: Create failure links to enable efficient transitions when a mismatch occurs. |
| 45 | +3. Search the Text: Use the trie and failure links to search the text for all patterns simultaneously. |
| 46 | + |
| 47 | +### Approaches |
| 48 | + |
| 49 | +#### Codes in Different Languages |
| 50 | + |
| 51 | +<Tabs> |
| 52 | + <TabItem value="cpp" label="C++"> |
| 53 | + <SolutionAuthor name="sjain1909"/> |
| 54 | + ```cpp |
| 55 | + #include <bits/stdc++.h> |
| 56 | + using namespace std; |
| 57 | + |
/// One state of the Aho-Corasick automaton (a node of the pattern trie).
struct TrieNode {
    /// Outgoing trie edges keyed by the next character. The pointers are
    /// non-owning: the AhoCorasick object that created the nodes owns them.
    std::unordered_map<char, TrieNode*> children;
    /// State to fall back to when no edge matches; set by buildFailureLinks().
    TrieNode* failure;
    /// Indices of the patterns recognised on reaching this state. After
    /// buildFailureLinks() this also holds the entries copied from the
    /// failure state's list.
    std::vector<int> output;
    TrieNode() : failure(nullptr) {}
};

/// Multi-pattern matcher: builds a trie with failure links from a set of
/// patterns and reports every occurrence of every pattern in a text.
///
/// Typical use: initialize(patterns), then search(text) or findMatches(text).
/// The object owns all trie nodes through a pool of unique_ptr, so they are
/// released on destruction and the class is movable but not copyable.
class AhoCorasick {
    /// Owns every node of the trie; TrieNode pointers elsewhere are borrowed.
    std::vector<std::unique_ptr<TrieNode>> nodes;
    TrieNode* root;
    std::vector<std::string> patterns;

    /// Allocates a node, hands ownership to the pool and returns a borrowed pointer.
    TrieNode* newNode() {
        std::unique_ptr<TrieNode> fresh(new TrieNode());
        nodes.push_back(std::move(fresh));
        return nodes.back().get();
    }

    /// Drops the current trie and starts again from a single empty root.
    void reset() {
        std::unique_ptr<TrieNode> freshRoot(new TrieNode());
        nodes.clear();
        nodes.push_back(std::move(freshRoot));
        root = nodes.back().get();
    }

    /// Walks the automaton over `text` and calls onMatch(start, patternIndex)
    /// for every match, in the order the matches end.
    template <typename Callback>
    void scan(const std::string& text, Callback onMatch) const {
        const TrieNode* node = root;
        for (std::size_t i = 0; i < text.size(); ++i) {
            const char c = text[i];
            auto edge = node->children.find(c);
            // Follow failure links until some state has an edge for c or we are at the root.
            while (node != root && edge == node->children.end()) {
                node = node->failure;
                edge = node->children.find(c);
            }
            node = (edge != node->children.end()) ? edge->second : root;

            for (int index : node->output) {
                // A match ending at i has length <= i + 1, so this cannot wrap around.
                onMatch(i + 1 - patterns[index].size(), index);
            }
        }
    }

public:
    AhoCorasick() : root(nullptr) {
        root = newNode();
    }

    /// Inserts `pattern` into the trie and records `index` at its final node.
    /// `index` is later used to look the pattern up in the list passed to
    /// initialize(). Empty patterns are ignored: they would be stored at the
    /// root and cannot be reported consistently.
    void addPattern(const std::string& pattern, int index) {
        if (pattern.empty()) {
            return;
        }
        TrieNode* node = root;
        for (char c : pattern) {
            auto edge = node->children.find(c);
            if (edge == node->children.end()) {
                edge = node->children.emplace(c, newNode()).first;
            }
            node = edge->second;
        }
        node->output.push_back(index);
    }

    /// Computes the failure link of every node breadth-first and appends each
    /// failure state's output list to the node's own list. Meant to run once
    /// after all patterns were added; running it again on the same trie
    /// appends the copied entries a second time.
    void buildFailureLinks() {
        std::queue<TrieNode*> pending;
        root->failure = root;
        for (auto& edge : root->children) {
            edge.second->failure = root;
            pending.push(edge.second);
        }

        while (!pending.empty()) {
            TrieNode* current = pending.front();
            pending.pop();

            for (auto& edge : current->children) {
                const char c = edge.first;
                TrieNode* child = edge.second;

                TrieNode* fail = current->failure;
                auto target = fail->children.find(c);
                while (fail != root && target == fail->children.end()) {
                    fail = fail->failure;
                    target = fail->children.find(c);
                }
                child->failure = (target != fail->children.end()) ? target->second : root;

                // The failure state is shallower than `child`, so its list is
                // already complete when it is copied here.
                child->output.insert(child->output.end(),
                                     child->failure->output.begin(),
                                     child->failure->output.end());

                pending.push(child);
            }
        }
    }

    /// Returns every match in `text` as (start index, pattern index) pairs,
    /// in the same order in which search() prints them.
    /// Requires failure links to have been built (initialize() does this).
    std::vector<std::pair<std::size_t, int>> findMatches(const std::string& text) const {
        std::vector<std::pair<std::size_t, int>> matches;
        scan(text, [&matches](std::size_t start, int index) {
            matches.emplace_back(start, index);
        });
        return matches;
    }

    /// Prints one "Pattern found: <pattern> at index <start>" line to stdout
    /// for every match in `text`.
    /// Requires failure links to have been built (initialize() does this).
    void search(const std::string& text) const {
        scan(text, [this](std::size_t start, int index) {
            std::cout << "Pattern found: " << patterns[index] << " at index " << start << "\n";
        });
    }

    /// Replaces any previously loaded pattern set with `patterns` and builds
    /// the automaton. The trie is reset first so that calling this twice does
    /// not leave nodes from the earlier set behind.
    void initialize(const std::vector<std::string>& patterns) {
        reset();
        this->patterns = patterns;
        for (std::size_t i = 0; i < patterns.size(); ++i) {
            addPattern(patterns[i], static_cast<int>(i));
        }
        buildFailureLinks();
    }
};
| 149 | + |
| 150 | +int main() { |
| 151 | + int n; |
| 152 | + cout << "Enter number of patterns: "; |
| 153 | + cin >> n; |
| 154 | + vector<string> patterns(n); |
| 155 | + cout << "Enter patterns:\n"; |
| 156 | + for (int i = 0; i < n; ++i) { |
| 157 | + cin >> patterns[i]; |
| 158 | + } |
| 159 | + |
| 160 | + string text; |
| 161 | + cout << "Enter text: "; |
| 162 | + cin >> text; |
| 163 | + |
| 164 | + AhoCorasick ac; |
| 165 | + ac.initialize(patterns); |
| 166 | + ac.search(text); |
| 167 | + |
| 168 | + return 0; |
| 169 | +} |
| 170 | + ``` |
| 171 | + </TabItem> |
| 172 | +</Tabs> |
| 173 | + |
| 174 | +### Complexity Analysis |
| 175 | + |
| 176 | +- **Time Complexity:** $O(N + M + Z)$ where `N` is the length of the text, `M` is the total length of all patterns, and `Z` is the number of pattern occurrences. |
| 177 | +- **Space Complexity:** $O(M)$ |
| 178 | + |
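| 179 | +The time complexity accounts for building the trie, constructing failure links, and searching the text; it is an expected bound because child lookups use hash maps. The trie itself takes space linear in the total length of the patterns. Note that this implementation copies each failure node's match list into the node's own output list, so when many patterns are suffixes of other trie paths the memory used for those lists can exceed that linear bound. |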
| 180 | + |
| 181 | +## Video Explanation of Given Problem |
| 182 | + |
| 183 | + <LiteYouTubeEmbed |
| 184 | + id="VSvE-zpZtso" |
| 185 | + params="autoplay=1&autohide=1&showinfo=0&rel=0" |
| 186 | + title="Problem Explanation | Solution | Approach" |
| 187 | + poster="maxresdefault" |
| 188 | + webp |
| 189 | + /> |
| 190 | +--- |
| 191 | + |
| 192 | +<h2>Authors:</h2> |
| 193 | + |
| 194 | +<div style={{display: 'flex', flexWrap: 'wrap', justifyContent: 'space-between', gap: '10px'}}> |
| 195 | +{['sjain1909'].map(username => ( |
| 196 | + <Author key={username} username={username} /> |
| 197 | +))} |
| 198 | +</div> |
0 commit comments