From 6c7761d5b2c9b3e666480d94bf90c8d6356fa6d9 Mon Sep 17 00:00:00 2001
From: asmit27rai
Date: Mon, 24 Mar 2025 02:46:15 +0530
Subject: [PATCH] feat: Add C++ backend for strings module

- Implemented C++ backend for string algorithms (KMP, Rabin-Karp, Boyer-Moore, Z-function).
- Implemented C++ backend for Trie data structure.
- Added Python-C++ interface using `Python.h`.
- Updated `_extensions.py` and `setup.py` to integrate the new backend.
---
Reviewer notes (not part of the commit message):
- Revised from 6c7761d. The `index` lines of the rewritten files were dropped
  because their blob ids no longer match; regenerate the patch with
  `git format-patch` before sending it on.
- setup.py reads `strings._extensions`; confirm that
  pydatastructs/strings/__init__.py imports `_extensions`.
- Match positions are byte offsets into the UTF-8 encoding of the text, which
  differ from Python `str` indices for non-ASCII input.
- `Trie` is compiled into `_strings` but is not yet exposed to Python.

 pydatastructs/strings/_backend/__init__.py    |   0
 .../_backend/cpp/algorithms/algorithms.cpp    | 141 ++++++++++++++++++
 .../_backend/cpp/algorithms/algorithms.hpp    |  23 +++
 pydatastructs/strings/_backend/cpp/string.cpp |  76 ++++++++++
 .../strings/_backend/cpp/trie/trie.cpp        |  74 +++++++++
 .../strings/_backend/cpp/trie/trie.hpp        |  44 ++++++
 pydatastructs/strings/_extensions.py          |  23 +++
 pydatastructs/utils/_backend/cpp/string.cpp   |  38 +++++
 pydatastructs/utils/_backend/cpp/string.hpp   |  20 +++
 setup.py                                      |   2 +
 10 files changed, 441 insertions(+)
 create mode 100644 pydatastructs/strings/_backend/__init__.py
 create mode 100644 pydatastructs/strings/_backend/cpp/algorithms/algorithms.cpp
 create mode 100644 pydatastructs/strings/_backend/cpp/algorithms/algorithms.hpp
 create mode 100644 pydatastructs/strings/_backend/cpp/string.cpp
 create mode 100644 pydatastructs/strings/_backend/cpp/trie/trie.cpp
 create mode 100644 pydatastructs/strings/_backend/cpp/trie/trie.hpp
 create mode 100644 pydatastructs/strings/_extensions.py
 create mode 100644 pydatastructs/utils/_backend/cpp/string.cpp
 create mode 100644 pydatastructs/utils/_backend/cpp/string.hpp

diff --git a/pydatastructs/strings/_backend/__init__.py b/pydatastructs/strings/_backend/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pydatastructs/strings/_backend/cpp/algorithms/algorithms.cpp b/pydatastructs/strings/_backend/cpp/algorithms/algorithms.cpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/_backend/cpp/algorithms/algorithms.cpp
@@ -0,0 +1,141 @@
+#include "algorithms.hpp"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
+
+// All searches return the 0-based byte offsets of every (possibly overlapping)
+// occurrence of `query` in `text`, in increasing order. An empty query, or a
+// query longer than the text, yields an empty result.
+
+// Knuth-Morris-Pratt: O(|text| + |query|) time using the prefix (failure) function.
+std::vector<int> kmp_search(const std::string& text, const std::string& query) {
+    std::vector<int> positions;
+    const std::size_t n = text.size();
+    const std::size_t m = query.size();
+    if (m == 0 || n < m) return positions;
+
+    // failure[i] is the length of the longest proper prefix of query[0..i]
+    // that is also a suffix of it.
+    std::vector<std::size_t> failure(m, 0);
+    for (std::size_t i = 1, len = 0; i < m; ++i) {
+        while (len > 0 && query[i] != query[len]) len = failure[len - 1];
+        if (query[i] == query[len]) ++len;
+        failure[i] = len;
+    }
+
+    for (std::size_t i = 0, matched = 0; i < n; ++i) {
+        while (matched > 0 && text[i] != query[matched]) matched = failure[matched - 1];
+        if (text[i] == query[matched]) ++matched;
+        if (matched == m) {
+            positions.push_back(static_cast<int>(i + 1 - m));
+            matched = failure[matched - 1];  // keep going to find overlapping matches
+        }
+    }
+
+    return positions;
+}
+
+// Rabin-Karp: rolling polynomial hash; candidates are confirmed by comparison.
+std::vector<int> rabin_karp_search(const std::string& text, const std::string& query) {
+    std::vector<int> positions;
+    const std::size_t n = text.size();
+    const std::size_t m = query.size();
+    if (m == 0 || n < m) return positions;
+
+    const long long BASE = 257;
+    const long long MOD = 1000000007;
+    // Hash bytes as 0..255; a plain (possibly signed) char would make the
+    // initial and the rolled hashes disagree for non-ASCII input.
+    const auto code = [](char c) {
+        return static_cast<long long>(static_cast<unsigned char>(c));
+    };
+
+    // BASE^(m - 1) mod MOD: weight of the window's leading byte.
+    long long lead_weight = 1;
+    for (std::size_t i = 1; i < m; ++i) {
+        lead_weight = (lead_weight * BASE) % MOD;
+    }
+
+    long long query_hash = 0, window_hash = 0;
+    for (std::size_t i = 0; i < m; ++i) {
+        query_hash = (query_hash * BASE + code(query[i])) % MOD;
+        window_hash = (window_hash * BASE + code(text[i])) % MOD;
+    }
+
+    for (std::size_t i = 0; i + m <= n; ++i) {
+        if (query_hash == window_hash && text.compare(i, m, query) == 0) {
+            positions.push_back(static_cast<int>(i));
+        }
+        if (i + m < n) {
+            // Drop text[i], then append text[i + m]; adding MOD keeps it non-negative.
+            window_hash = (window_hash - code(text[i]) * lead_weight % MOD + MOD) % MOD;
+            window_hash = (window_hash * BASE + code(text[i + m])) % MOD;
+        }
+    }
+
+    return positions;
+}
+
+// Boyer-Moore using the bad-character rule.
+std::vector<int> boyer_moore_search(const std::string& text, const std::string& query) {
+    std::vector<int> positions;
+    // Compare sizes before converting: the original unsigned
+    // `text.size() - query.size()` wrapped around when the query was longer.
+    if (query.empty() || text.size() < query.size()) return positions;
+    const int n = static_cast<int>(text.size());
+    const int m = static_cast<int>(query.size());
+
+    // last_index[b] is the rightmost position of byte b in query, or -1.
+    std::array<int, 256> last_index;
+    last_index.fill(-1);
+    for (int i = 0; i < m; ++i) {
+        last_index[static_cast<unsigned char>(query[i])] = i;
+    }
+
+    int shift = 0;
+    while (shift <= n - m) {
+        int j = m - 1;
+        while (j >= 0 && query[j] == text[shift + j]) --j;
+        if (j < 0) {
+            positions.push_back(shift);
+            // Align the byte just past the window with its last occurrence in query.
+            shift += (shift + m < n)
+                ? m - last_index[static_cast<unsigned char>(text[shift + m])]
+                : 1;
+        } else {
+            shift += std::max(1, j - last_index[static_cast<unsigned char>(text[shift + j])]);
+        }
+    }
+
+    return positions;
+}
+
+// Z-function search over `query + '$' + text`.
+std::vector<int> z_function_search(const std::string& text, const std::string& query) {
+    std::vector<int> positions;
+    const std::size_t m = query.size();
+    if (m == 0 || text.size() < m) return positions;
+
+    const std::string combined = query + '$' + text;
+    const std::size_t total = combined.size();
+    std::vector<std::size_t> z(total, 0);
+    std::size_t left = 0, right = 0;  // [left, right): rightmost segment matching a prefix
+
+    for (std::size_t i = 1; i < total; ++i) {
+        if (i < right) z[i] = std::min(right - i, z[i - left]);
+        while (i + z[i] < total && combined[z[i]] == combined[i + z[i]]) ++z[i];
+        if (i + z[i] > right) {
+            left = i;
+            right = i + z[i];
+        }
+    }
+
+    for (std::size_t i = m + 1; i < total; ++i) {
+        // `>=`, not `==`: when the inputs contain '$' a match can extend past
+        // the separator, making z[i] larger than the query length.
+        if (z[i] >= m) positions.push_back(static_cast<int>(i - m - 1));
+    }
+
+    return positions;
+}
diff --git a/pydatastructs/strings/_backend/cpp/algorithms/algorithms.hpp b/pydatastructs/strings/_backend/cpp/algorithms/algorithms.hpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/_backend/cpp/algorithms/algorithms.hpp
@@ -0,0 +1,23 @@
+#ifndef STRINGS_ALGORITHMS_HPP
+#define STRINGS_ALGORITHMS_HPP
+
+#include <string>
+#include <vector>
+
+// Each function returns the 0-based byte offsets of all (possibly overlapping)
+// occurrences of `query` in `text`, in increasing order. An empty query or a
+// query longer than the text yields an empty vector.
+
+// Knuth-Morris-Pratt Algorithm
+std::vector<int> kmp_search(const std::string& text, const std::string& query);
+
+// Rabin-Karp Algorithm
+std::vector<int> rabin_karp_search(const std::string& text, const std::string& query);
+
+// Boyer-Moore Algorithm
+std::vector<int> boyer_moore_search(const std::string& text, const std::string& query);
+
+// Z-Function Algorithm
+std::vector<int> z_function_search(const std::string& text, const std::string& query);
+
+#endif
diff --git a/pydatastructs/strings/_backend/cpp/string.cpp b/pydatastructs/strings/_backend/cpp/string.cpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/_backend/cpp/string.cpp
@@ -0,0 +1,76 @@
+#include <Python.h>
+
+#include <exception>
+#include <string>
+#include <vector>
+
+#include "algorithms/algorithms.hpp"
+#include "trie/trie.hpp"
+#include "../../../utils/_backend/cpp/string.hpp"
+
+// Signature shared by every search routine in algorithms.hpp.
+using SearchFunction = std::vector<int> (*)(const std::string&, const std::string&);
+
+// Parses (text, query) from `args`, runs `search` and returns the match
+// positions as a Python list. Returns NULL with a Python exception set when
+// the arguments are invalid or the search throws.
+static PyObject* run_search(PyObject* args, SearchFunction search) {
+    PyObject *text_obj, *query_obj;
+    if (!PyArg_ParseTuple(args, "OO", &text_obj, &query_obj)) {
+        return NULL;
+    }
+    try {
+        const std::string text = pyobj_to_string(text_obj);
+        if (PyErr_Occurred()) return NULL;
+        const std::string query = pyobj_to_string(query_obj);
+        if (PyErr_Occurred()) return NULL;
+        return vector_to_pylist(search(text, query));
+    } catch (const std::exception& error) {
+        // C++ exceptions must never propagate into the interpreter.
+        PyErr_SetString(PyExc_RuntimeError, error.what());
+        return NULL;
+    }
+}
+
+// Python wrapper for KMP algorithm
+static PyObject* py_kmp_search(PyObject* self, PyObject* args) {
+    return run_search(args, kmp_search);
+}
+
+// Python wrapper for Rabin-Karp algorithm
+static PyObject* py_rabin_karp_search(PyObject* self, PyObject* args) {
+    return run_search(args, rabin_karp_search);
+}
+
+// Python wrapper for Boyer-Moore algorithm
+static PyObject* py_boyer_moore_search(PyObject* self, PyObject* args) {
+    return run_search(args, boyer_moore_search);
+}
+
+// Python wrapper for Z-function algorithm
+static PyObject* py_z_function_search(PyObject* self, PyObject* args) {
+    return run_search(args, z_function_search);
+}
+
+// Define the module's method table
+static PyMethodDef StringsMethods[] = {
+    {"kmp_search", py_kmp_search, METH_VARARGS, "Perform KMP search"},
+    {"rabin_karp_search", py_rabin_karp_search, METH_VARARGS, "Perform Rabin-Karp search"},
+    {"boyer_moore_search", py_boyer_moore_search, METH_VARARGS, "Perform Boyer-Moore search"},
+    {"z_function_search", py_z_function_search, METH_VARARGS, "Perform Z-function search"},
+    {NULL, NULL, 0, NULL}
+};
+
+// Define the module
+static struct PyModuleDef stringsmodule = {
+    PyModuleDef_HEAD_INIT,
+    "_strings",
+    NULL,
+    -1,
+    StringsMethods
+};
+
+// Module initialization function
+PyMODINIT_FUNC PyInit__strings(void) {
+    return PyModule_Create(&stringsmodule);
+}
diff --git a/pydatastructs/strings/_backend/cpp/trie/trie.cpp b/pydatastructs/strings/_backend/cpp/trie/trie.cpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/_backend/cpp/trie/trie.cpp
@@ -0,0 +1,74 @@
+#include "trie.hpp"
+
+#include <utility>
+
+// Deleting a node recursively frees its whole subtree.
+TrieNode::~TrieNode() {
+    for (auto& entry : children) {
+        delete entry.second;
+    }
+}
+
+Trie::Trie() : root(new TrieNode()) {}
+
+Trie::~Trie() {
+    delete root;
+}
+
+// Walks `path` from the root; returns the node reached, or nullptr when the
+// trie contains no such path. Never inserts.
+const TrieNode* Trie::find_node(const std::string& path) const {
+    const TrieNode* current = root;
+    for (char ch : path) {
+        const auto next = current->children.find(ch);
+        if (next == current->children.end()) {
+            return nullptr;
+        }
+        current = next->second;
+    }
+    return current;
+}
+
+void Trie::insert(const std::string& word) {
+    TrieNode* current = root;
+    for (char ch : word) {
+        auto child = current->children.find(ch);
+        if (child == current->children.end()) {
+            child = current->children.emplace(ch, new TrieNode(ch)).first;
+        }
+        current = child->second;
+    }
+    current->is_terminal = true;
+}
+
+bool Trie::search(const std::string& word) const {
+    const TrieNode* node = find_node(word);
+    return node != nullptr && node->is_terminal;
+}
+
+bool Trie::starts_with(const std::string& prefix) const {
+    return find_node(prefix) != nullptr;
+}
+
+std::vector<std::string> Trie::strings_with_prefix(const std::string& prefix) const {
+    std::vector<std::string> result;
+    const TrieNode* start = find_node(prefix);
+    if (start == nullptr) {
+        return result;
+    }
+    // Iterative DFS; each stack entry pairs a node with the string spelled so far.
+    std::vector<std::pair<const TrieNode*, std::string>> stack;
+    stack.emplace_back(start, prefix);
+    while (!stack.empty()) {
+        const TrieNode* node = stack.back().first;
+        std::string spelled = std::move(stack.back().second);
+        stack.pop_back();
+        if (node->is_terminal) {
+            result.push_back(spelled);
+        }
+        for (const auto& entry : node->children) {
+            stack.emplace_back(entry.second, spelled + entry.first);
+        }
+    }
+    return result;
+}
diff --git a/pydatastructs/strings/_backend/cpp/trie/trie.hpp b/pydatastructs/strings/_backend/cpp/trie/trie.hpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/_backend/cpp/trie/trie.hpp
@@ -0,0 +1,44 @@
+#ifndef STRINGS_TRIE_HPP
+#define STRINGS_TRIE_HPP
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+// A trie node; owns its children and frees them on destruction.
+class TrieNode {
+public:
+    char character;
+    bool is_terminal;  // true when a stored word ends at this node
+    std::unordered_map<char, TrieNode*> children;
+
+    explicit TrieNode(char ch = '\0') : character(ch), is_terminal(false) {}
+    ~TrieNode();
+
+    // Copying would make two nodes delete the same children.
+    TrieNode(const TrieNode&) = delete;
+    TrieNode& operator=(const TrieNode&) = delete;
+};
+
+// Prefix tree over the bytes of std::string keys.
+class Trie {
+public:
+    TrieNode* root;
+
+    Trie();
+    ~Trie();
+
+    // Copying would make two tries delete the same root.
+    Trie(const Trie&) = delete;
+    Trie& operator=(const Trie&) = delete;
+
+    void insert(const std::string& word);
+    bool search(const std::string& word) const;
+    bool starts_with(const std::string& prefix) const;
+    std::vector<std::string> strings_with_prefix(const std::string& prefix) const;
+
+private:
+    const TrieNode* find_node(const std::string& path) const;
+};
+
+#endif
diff --git a/pydatastructs/strings/_extensions.py b/pydatastructs/strings/_extensions.py
new file mode 100644
--- /dev/null
+++ b/pydatastructs/strings/_extensions.py
@@ -0,0 +1,23 @@
+from setuptools import Extension
+
+project = 'pydatastructs'
+
+module = 'strings'
+
+backend = '_backend'
+
+cpp = 'cpp'
+
+# A single extension module: string.cpp defines PyInit__strings and calls into
+# the algorithm, trie and conversion sources, so they must be linked together.
+strings = '.'.join([project, module, backend, cpp, '_strings'])
+strings_sources = [
+    '/'.join([project, module, backend, cpp, 'string.cpp']),
+    '/'.join([project, module, backend, cpp, 'algorithms', 'algorithms.cpp']),
+    '/'.join([project, module, backend, cpp, 'trie', 'trie.cpp']),
+    '/'.join([project, 'utils', backend, cpp, 'string.cpp'])
+]
+
+extensions = [
+    Extension(strings, sources=strings_sources)
+]
diff --git a/pydatastructs/utils/_backend/cpp/string.cpp b/pydatastructs/utils/_backend/cpp/string.cpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/utils/_backend/cpp/string.cpp
@@ -0,0 +1,38 @@
+#include "string.hpp"
+#include "utils.hpp"
+
+std::string pyobj_to_string(PyObject* obj) {
+    if (!PyUnicode_Check(obj)) {
+        PyErr_SetString(PyExc_TypeError, "Expected a string object.");
+        return "";
+    }
+    Py_ssize_t size = 0;
+    const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+    if (data == NULL) {
+        // Encoding failed; the Python exception is already set.
+        return "";
+    }
+    // Pass the length explicitly so embedded NUL characters are preserved.
+    return std::string(data, static_cast<std::size_t>(size));
+}
+
+PyObject* string_to_pyobj(const std::string& str) {
+    return PyUnicode_FromStringAndSize(str.data(), static_cast<Py_ssize_t>(str.size()));
+}
+
+PyObject* vector_to_pylist(const std::vector<int>& vec) {
+    PyObject* list = PyList_New(static_cast<Py_ssize_t>(vec.size()));
+    if (list == NULL) {
+        return NULL;
+    }
+    for (std::size_t i = 0; i < vec.size(); i++) {
+        PyObject* item = PyLong_FromLong(vec[i]);
+        if (item == NULL) {
+            Py_DECREF(list);
+            return NULL;
+        }
+        // PyList_SetItem steals the reference to item.
+        PyList_SetItem(list, static_cast<Py_ssize_t>(i), item);
+    }
+    return list;
+}
diff --git a/pydatastructs/utils/_backend/cpp/string.hpp b/pydatastructs/utils/_backend/cpp/string.hpp
new file mode 100644
--- /dev/null
+++ b/pydatastructs/utils/_backend/cpp/string.hpp
@@ -0,0 +1,20 @@
+#ifndef STRINGS_UTILS_HPP
+#define STRINGS_UTILS_HPP
+
+#include <Python.h>
+
+#include <string>
+#include <vector>
+
+// Convert Python string to C++ string (UTF-8 encoded).
+// On failure returns "" with a Python exception set; check PyErr_Occurred().
+std::string pyobj_to_string(PyObject* obj);
+
+// Convert C++ string to Python string
+PyObject* string_to_pyobj(const std::string& str);
+
+// Convert C++ vector to Python list.
+// Returns a new reference, or NULL with a Python exception set.
+PyObject* vector_to_pylist(const std::vector<int>& vec);
+
+#endif
diff --git a/setup.py b/setup.py
index 60c4ec36d..3d253e672 100644
--- a/setup.py
+++ b/setup.py
@@ -3,6 +3,7 @@
 from pydatastructs import linear_data_structures
 from pydatastructs import miscellaneous_data_structures
 from pydatastructs import trees
+from pydatastructs import strings
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
@@ -13,6 +14,7 @@
 extensions.extend(linear_data_structures._extensions.extensions)
 extensions.extend(miscellaneous_data_structures._extensions.extensions)
 extensions.extend(trees._extensions.extensions)
+extensions.extend(strings._extensions.extensions)
 
 setuptools.setup(
     name="cz-pydatastructs",