Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add C++ backend for strings module #671

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
143 changes: 143 additions & 0 deletions pydatastructs/strings/_backend/cpp/algorithms/algorithms.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#include "algorithms.hpp"

#include <algorithm>
#include <cmath>
#include <string>
#include <unordered_map>
#include <vector>

// Knuth-Morris-Pratt Algorithm
/// Knuth-Morris-Pratt substring search.
///
/// @param text  String to search in.
/// @param query Pattern to search for.
/// @return 0-based start index of every occurrence of `query` in `text`
///         (overlapping occurrences included), in increasing order. Empty when
///         either argument is empty.
std::vector<int> kmp_search(const std::string& text, const std::string& query) {
    std::vector<int> matches;
    if (text.empty() || query.empty()) {
        return matches;
    }

    const int text_len = static_cast<int>(text.size());
    const int query_len = static_cast<int>(query.size());

    // fallback[i] is the pattern index to resume from after a mismatch at
    // pattern index i; -1 means "advance the text and restart the pattern".
    // The extra slot at query_len is used after a full match.
    std::vector<int> fallback(query_len + 1, 0);
    fallback[0] = -1;
    int candidate = 0;
    for (int idx = 1; idx < query_len; ++idx, ++candidate) {
        if (query[idx] != query[candidate]) {
            fallback[idx] = candidate;
            while (candidate >= 0 && query[idx] != query[candidate]) {
                candidate = fallback[candidate];
            }
        } else {
            fallback[idx] = fallback[candidate];
        }
    }
    fallback[query_len] = candidate;

    // Scan the text once; text_pos never moves backwards.
    int text_pos = 0;
    int query_pos = 0;
    while (text_pos < text_len) {
        if (query[query_pos] != text[text_pos]) {
            query_pos = fallback[query_pos];
            if (query_pos < 0) {
                ++text_pos;
                ++query_pos;
            }
            continue;
        }
        ++text_pos;
        ++query_pos;
        if (query_pos == query_len) {
            matches.push_back(text_pos - query_pos);
            // Resume from the longest border so overlapping matches are found.
            query_pos = fallback[query_pos];
        }
    }

    return matches;
}

// Rabin-Karp Algorithm
/// Rabin-Karp substring search using a polynomial rolling hash.
///
/// Every hash hit is confirmed with a direct comparison, so hash collisions
/// never produce false positives.
///
/// @param text  String to search in.
/// @param query Pattern to search for.
/// @return 0-based start index of every occurrence of `query` in `text`
///         (overlapping occurrences included), in increasing order. Empty when
///         either argument is empty or `query` is longer than `text`.
std::vector<int> rabin_karp_search(const std::string& text, const std::string& query) {
    std::vector<int> positions;
    if (text.empty() || query.empty()) return positions;
    // A pattern longer than the text cannot match. Returning here also keeps
    // the first-window loop below from indexing past the end of `text`.
    if (query.size() > text.size()) return positions;

    const long long PRIME = 257;
    const long long MOD = 1000000007;
    const int t = static_cast<int>(text.size());
    const int q = static_cast<int>(query.size());

    // Hash bytes as values in [0, 255]. Plain `char` may be signed; negative
    // bytes would leave the initial hashes negative while the rolled hash is
    // normalised to [0, MOD), so identical windows would compare unequal.
    const auto byte_value = [](char c) -> long long {
        return static_cast<unsigned char>(c);
    };

    // power = PRIME^(q-1) mod MOD: the weight of the window's leading byte.
    long long power = 1;
    for (int i = 0; i < q - 1; i++) {
        power = (power * PRIME) % MOD;
    }

    // Hash of the pattern and of the first window of the text.
    long long query_hash = 0;
    long long text_hash = 0;
    for (int i = 0; i < q; i++) {
        query_hash = (query_hash * PRIME + byte_value(query[i])) % MOD;
        text_hash = (text_hash * PRIME + byte_value(text[i])) % MOD;
    }

    // Slide the window over the text.
    for (int i = 0; i <= t - q; i++) {
        // compare() checks in place; no temporary substring is allocated.
        if (query_hash == text_hash && text.compare(i, q, query) == 0) {
            positions.push_back(i);
        }
        if (i < t - q) {
            // Drop the leading byte, shift by one position, append the next byte.
            text_hash = (PRIME * (text_hash - byte_value(text[i]) * power)
                         + byte_value(text[i + q])) % MOD;
            // `%` keeps the sign of the dividend; bring the result into [0, MOD).
            if (text_hash < 0) text_hash += MOD;
        }
    }

    return positions;
}

// Boyer-Moore Algorithm
/// Boyer-Moore substring search using the bad-character rule.
///
/// @param text  String to search in.
/// @param query Pattern to search for.
/// @return 0-based start index of every occurrence of `query` in `text`
///         (overlapping occurrences included), in increasing order. Empty when
///         either argument is empty or `query` is longer than `text`.
std::vector<int> boyer_moore_search(const std::string& text, const std::string& query) {
    std::vector<int> positions;
    if (text.empty() || query.empty()) return positions;
    // Without this guard `text.size() - query.size()` wraps around (unsigned)
    // and the search loop reads far past the end of `text`.
    if (query.size() > text.size()) return positions;

    const int n = static_cast<int>(text.size());
    const int m = static_cast<int>(query.size());

    // Bad-character table: last index of each byte value in `query`, or -1 if
    // the byte does not occur. Indexed by unsigned byte so lookups never
    // insert entries and absent bytes are distinguishable from index 0.
    std::vector<int> last_index(256, -1);
    for (int i = 0; i < m; i++) {
        last_index[static_cast<unsigned char>(query[i])] = i;
    }

    int shift = 0;
    while (shift <= n - m) {
        // Compare the window right-to-left.
        int j = m - 1;
        while (j >= 0 && query[j] == text[shift + j]) {
            j--;
        }
        if (j < 0) {
            positions.push_back(shift);
            if (shift + m < n) {
                // Align the byte just past the window with its last occurrence
                // in the pattern (or jump past it entirely when absent).
                shift += m - last_index[static_cast<unsigned char>(text[shift + m])];
            } else {
                shift += 1;
            }
        } else {
            // Align the mismatching text byte with its last occurrence in the
            // pattern; always advance by at least one position.
            shift += std::max(1, j - last_index[static_cast<unsigned char>(text[shift + j])]);
        }
    }

    return positions;
}

// Z-Function Algorithm
/// Substring search based on the Z-function of `query + "$" + text`.
///
/// @param text  String to search in.
/// @param query Pattern to search for.
/// @return 0-based start index of every occurrence of `query` in `text`
///         (overlapping occurrences included), in increasing order. Empty when
///         either argument is empty.
std::vector<int> z_function_search(const std::string& text, const std::string& query) {
    std::vector<int> positions;
    if (text.empty() || query.empty()) return positions;

    const std::string combined = query + "$" + text;
    const int total = static_cast<int>(combined.size());
    const int m = static_cast<int>(query.size());

    // z[i] = length of the longest common prefix of `combined` and
    // `combined[i..]`. [l, r] is the rightmost matched segment found so far.
    std::vector<int> z(total, 0);
    int l = 0, r = 0;
    for (int i = 1; i < total; i++) {
        if (i <= r) {
            // Reuse the value computed for the mirrored position inside [l, r].
            z[i] = std::min(r - i + 1, z[i - l]);
        }
        while (i + z[i] < total && combined[z[i]] == combined[i + z[i]]) {
            z[i]++;
        }
        if (i + z[i] - 1 > r) {
            l = i;
            r = i + z[i] - 1;
        }
    }

    for (int i = m + 1; i < total; i++) {
        // Use `>=`, not `==`: the inputs may themselves contain '$', in which
        // case a match can extend past the separator and z[i] exceeds m.
        if (z[i] >= m) {
            positions.push_back(i - m - 1);
        }
    }

    return positions;
}
22 changes: 22 additions & 0 deletions pydatastructs/strings/_backend/cpp/algorithms/algorithms.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#ifndef STRINGS_ALGORITHMS_HPP
#define STRINGS_ALGORITHMS_HPP

#include <Python.h>
#include <vector>
#include <string>
#include <unordered_map>
#include <cmath>

// All four searches share one contract: they return the 0-based start index of
// each match of `query` found in `text`, and return an empty vector when either
// string is empty.

// Knuth-Morris-Pratt Algorithm
// Builds a partial-match (failure) table for `query`, then scans `text` once.
std::vector<int> kmp_search(const std::string& text, const std::string& query);

// Rabin-Karp Algorithm
// Compares a rolling hash of each text window against the hash of `query` and
// confirms every hash hit with a direct string comparison.
std::vector<int> rabin_karp_search(const std::string& text, const std::string& query);

// Boyer-Moore Algorithm
// Compares each window right-to-left and shifts using a bad-character table.
std::vector<int> boyer_moore_search(const std::string& text, const std::string& query);

// Z-Function Algorithm
// Computes the Z-array of `query + "$" + text` and reports positions whose
// Z-value covers the whole query.
std::vector<int> z_function_search(const std::string& text, const std::string& query);

#endif
75 changes: 75 additions & 0 deletions pydatastructs/strings/_backend/cpp/string.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#include <Python.h>
#include "algorithms/algorithms.hpp"
#include "trie/trie.hpp"
#include "utils/_backend/cpp/string.hpp"

// Python wrapper for KMP algorithm
// Python entry point for kmp_search(text, query).
// Parses two positional arguments, converts each with pyobj_to_string, runs
// kmp_search and hands the resulting positions to vector_to_pylist.
// Returns a null pointer when argument parsing fails.
static PyObject* py_kmp_search(PyObject* self, PyObject* args) {
    PyObject* text_arg = nullptr;
    PyObject* query_arg = nullptr;
    if (PyArg_ParseTuple(args, "OO", &text_arg, &query_arg) == 0) {
        return nullptr;
    }

    const std::string haystack = pyobj_to_string(text_arg);
    const std::string needle = pyobj_to_string(query_arg);

    std::vector<int> hits = kmp_search(haystack, needle);
    return vector_to_pylist(hits);
}

// Python wrapper for Rabin-Karp algorithm
// Python entry point for rabin_karp_search(text, query).
// Parses two positional arguments, converts each with pyobj_to_string, runs
// rabin_karp_search and hands the resulting positions to vector_to_pylist.
// Returns a null pointer when argument parsing fails.
static PyObject* py_rabin_karp_search(PyObject* self, PyObject* args) {
    PyObject* text_arg = nullptr;
    PyObject* query_arg = nullptr;
    if (PyArg_ParseTuple(args, "OO", &text_arg, &query_arg) == 0) {
        return nullptr;
    }

    const std::string haystack = pyobj_to_string(text_arg);
    const std::string needle = pyobj_to_string(query_arg);

    std::vector<int> hits = rabin_karp_search(haystack, needle);
    return vector_to_pylist(hits);
}

// Python wrapper for Boyer-Moore algorithm
// Python entry point for boyer_moore_search(text, query).
// Parses two positional arguments, converts each with pyobj_to_string, runs
// boyer_moore_search and hands the resulting positions to vector_to_pylist.
// Returns a null pointer when argument parsing fails.
static PyObject* py_boyer_moore_search(PyObject* self, PyObject* args) {
    PyObject* text_arg = nullptr;
    PyObject* query_arg = nullptr;
    if (PyArg_ParseTuple(args, "OO", &text_arg, &query_arg) == 0) {
        return nullptr;
    }

    const std::string haystack = pyobj_to_string(text_arg);
    const std::string needle = pyobj_to_string(query_arg);

    std::vector<int> hits = boyer_moore_search(haystack, needle);
    return vector_to_pylist(hits);
}

// Python wrapper for Z-function algorithm
// Python entry point for z_function_search(text, query).
// Parses two positional arguments, converts each with pyobj_to_string, runs
// z_function_search and hands the resulting positions to vector_to_pylist.
// Returns a null pointer when argument parsing fails.
static PyObject* py_z_function_search(PyObject* self, PyObject* args) {
    PyObject* text_arg = nullptr;
    PyObject* query_arg = nullptr;
    if (PyArg_ParseTuple(args, "OO", &text_arg, &query_arg) == 0) {
        return nullptr;
    }

    const std::string haystack = pyobj_to_string(text_arg);
    const std::string needle = pyobj_to_string(query_arg);

    std::vector<int> hits = z_function_search(haystack, needle);
    return vector_to_pylist(hits);
}

// Define the module's method table
// Each entry is {Python-visible name, C++ wrapper, calling convention, docstring}.
// METH_VARARGS: the wrapper receives its positional arguments as one tuple,
// which the wrappers above unpack with PyArg_ParseTuple.
static PyMethodDef StringsMethods[] = {
{"kmp_search", py_kmp_search, METH_VARARGS, "Perform KMP search"},
{"rabin_karp_search", py_rabin_karp_search, METH_VARARGS, "Perform Rabin-Karp search"},
{"boyer_moore_search", py_boyer_moore_search, METH_VARARGS, "Perform Boyer-Moore search"},
{"z_function_search", py_z_function_search, METH_VARARGS, "Perform Z-function search"},
// All-null sentinel entry marking the end of the table.
{NULL, NULL, 0, NULL}
};

// Define the module
// Module description handed to PyModule_Create in PyInit__strings.
// Fields after PyModuleDef_HEAD_INIT are positional; per the CPython
// PyModuleDef layout they are m_name, m_doc, m_size, m_methods.
static struct PyModuleDef stringsmodule = {
PyModuleDef_HEAD_INIT,
"_strings",      // m_name: must match the suffix of PyInit__strings
NULL,            // m_doc: no module docstring
-1,              // m_size: -1 means no per-module state block is allocated
StringsMethods   // m_methods: the function table defined above
};

// Module initialization function
// Entry point the Python import machinery looks up when importing `_strings`.
// Creates the module object from `stringsmodule` and returns it; the result of
// PyModule_Create is returned unchanged, including on failure.
PyMODINIT_FUNC PyInit__strings(void) {
return PyModule_Create(&stringsmodule);
}
73 changes: 73 additions & 0 deletions pydatastructs/strings/_backend/cpp/trie/trie.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include "trie.hpp"

// Frees the whole subtree below this node: deleting a child runs that child's
// destructor, which in turn deletes its own children.
TrieNode::~TrieNode() {
    for (auto it = children.begin(); it != children.end(); ++it) {
        delete it->second;
    }
}

// An empty trie consists of a single default-constructed root node.
Trie::Trie() : root(new TrieNode()) {}

// Deleting the root releases every node through TrieNode's destructor.
Trie::~Trie() {
    delete root;
}

void Trie::insert(const std::string& word) {
TrieNode* current = root;
for (char ch : word) {
if (current->children.find(ch) == current->children.end()) {
current->children[ch] = new TrieNode(ch);
}
current = current->children[ch];
}
current->is_terminal = true;
}

bool Trie::search(const std::string& word) {
TrieNode* current = root;
for (char ch : word) {
if (current->children.find(ch) == current->children.end()) {
return false;
}
current = current->children[ch];
}
return current->is_terminal;
}

bool Trie::starts_with(const std::string& prefix) {
TrieNode* current = root;
for (char ch : prefix) {
if (current->children.find(ch) == current->children.end()) {
return false;
}
current = current->children[ch];
}
return true;
}

std::vector<std::string> Trie::strings_with_prefix(const std::string& prefix) {
std::vector<std::string> result;
TrieNode* current = root;
for (char ch : prefix) {
if (current->children.find(ch) == current->children.end()) {
return result;
}
current = current->children[ch];
}
// Perform DFS to collect all strings with the given prefix
std::vector<std::pair<TrieNode*, std::string>> stack;
stack.push_back({current, prefix});
while (!stack.empty()) {
auto [node, str] = stack.back();
stack.pop_back();
if (node->is_terminal) {
result.push_back(str);
}
for (auto& pair : node->children) {
stack.push_back({pair.second, str + pair.first});
}
}
return result;
}
32 changes: 32 additions & 0 deletions pydatastructs/strings/_backend/cpp/trie/trie.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef STRINGS_TRIE_HPP
#define STRINGS_TRIE_HPP

#include <Python.h>
#include <unordered_map>
#include <vector>
#include <string>

// One node of a character trie.
// A node owns its children: the destructor deletes every pointer stored in
// `children`, so the pointers must come from `new` and must not be shared.
class TrieNode {
public:
    char character;    // character on the edge leading to this node ('\0' by default)
    bool is_terminal;  // true when a stored word ends at this node
    std::unordered_map<char, TrieNode*> children;  // owned child nodes, keyed by next character

    TrieNode(char ch = '\0') : character(ch), is_terminal(false) {}
    ~TrieNode();

    // Non-copyable: a memberwise copy would share the child pointers and both
    // copies would delete them on destruction.
    TrieNode(const TrieNode&) = delete;
    TrieNode& operator=(const TrieNode&) = delete;
};

// Character trie storing a set of strings.
// The trie owns `root` (allocated in the constructor, deleted in the
// destructor) and, through it, every node.
class Trie {
public:
    TrieNode* root;  // owned; represents the empty prefix

    Trie();
    ~Trie();

    // Non-copyable: a memberwise copy would share `root` and both copies would
    // delete it on destruction.
    Trie(const Trie&) = delete;
    Trie& operator=(const Trie&) = delete;

    // Stores `word` in the trie.
    void insert(const std::string& word);
    // True if `word` was inserted as a complete word.
    bool search(const std::string& word);
    // True if a path for `prefix` exists in the trie.
    bool starts_with(const std::string& prefix);
    // All stored words that begin with `prefix`.
    std::vector<std::string> strings_with_prefix(const std::string& prefix);
};

#endif
35 changes: 35 additions & 0 deletions pydatastructs/strings/_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
"""setuptools extension definitions for the C++ backend of the strings module.

Exposes ``extensions``, the list of ``Extension`` objects to build.
"""
from setuptools import Extension

project = 'pydatastructs'

module = 'strings'

backend = '_backend'

cpp = 'cpp'

# Define the extension for string algorithms
# NOTE(review): algorithms.cpp defines no PyInit__algorithms function, so this
# extension is not importable as a Python module unless the utils source
# provides one -- confirm it is meant to be built separately.
algorithms = '.'.join([project, module, backend, cpp, '_algorithms'])
algorithms_sources = [
    '/'.join([project, module, backend, cpp, 'algorithms', 'algorithms.cpp']),
    '/'.join([project, "utils", "_backend", "cpp", "string.cpp"])
]

# Define the extension for the Trie data structure
# NOTE(review): trie.cpp defines no PyInit__trie function either -- same
# question as for the algorithms extension.
trie = '.'.join([project, module, backend, cpp, '_trie'])
trie_sources = [
    '/'.join([project, module, backend, cpp, 'trie', 'trie.cpp']),
    '/'.join([project, "utils", "_backend", "cpp", "string.cpp"])
]

# Define the extension for the main strings module
strings = '.'.join([project, module, backend, cpp, '_strings'])
strings_sources = [
    # The module source is string.cpp (it defines PyInit__strings).
    '/'.join([project, module, backend, cpp, 'string.cpp']),
    # string.cpp calls kmp_search, rabin_karp_search, boyer_moore_search and
    # z_function_search, which are defined in algorithms.cpp; compile it into
    # this extension so those symbols resolve.
    '/'.join([project, module, backend, cpp, 'algorithms', 'algorithms.cpp'])
    # NOTE(review): string.cpp also calls pyobj_to_string / vector_to_pylist
    # from utils/_backend/cpp/string.hpp; if they are implemented in
    # utils/_backend/cpp/string.cpp, that file must be listed here too.
]

extensions = [
    Extension(algorithms, sources=algorithms_sources),
    Extension(trie, sources=trie_sources),
    Extension(strings, sources=strings_sources)
]
Loading
Loading