Skip to content

Commit 438da18

Browse files
authored
Merge pull request #48 from BianchTech/46-refactor-move-string-manipulation-code-to-a-utility-folder
refactor(string_operations.h/cpp): add new util operations
2 parents 32d3202 + 1dda89e commit 438da18

File tree

7 files changed

+116
-129
lines changed

7 files changed

+116
-129
lines changed

lib/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ add_library(
88
src/inverted_index.cpp
99
src/preprocessing/stemmer.cpp
1010
src/exceptions/invalid_pointer_exception.cpp
11+
src/utils/string_operations.cpp
1112
)
1213

1314
target_include_directories(search_engine PUBLIC include)

lib/include/inverted_index.h

Lines changed: 2 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include <string>
1010
#include <vector>
1111

12+
#include "utils/string_operations.h"
13+
1214
namespace inverted_index {
1315
/**
1416
* @typedef str
@@ -22,12 +24,6 @@ typedef std::string str;
2224
*/
2325
typedef std::list<str> list_str;
2426

25-
/**
26-
* @define DELIMITER
27-
* @brief Delimiter used to split strings.
28-
*/
29-
#define DELIMITER " "
30-
3127
/**
3228
* @struct docs
3329
* @brief Structure that stores information about a document.
@@ -107,23 +103,6 @@ typedef std::set<docs> set_docs;
107103
*/
108104
typedef std::vector<str> vector_str;
109105

110-
/**
111-
* @brief Converts a character to lowercase.
112-
* @param c Character to convert.
113-
* @return Character converted to lowercase.
114-
*/
115-
char to_lowercase(unsigned char c) {
116-
return std::tolower(c);
117-
}
118-
119-
/**
120-
* @brief Splits a string based on a delimiter.
121-
* @param s String to split.
122-
* @param delimiter Delimiter to split the string.
123-
* @return Vector of strings resulting from the split.
124-
*/
125-
vector_str split(str& s, const str& delimiter);
126-
127106
/**
128107
* @brief Adds a new document to the document map.
129108
* @param mp Map of words to lists of documents.
@@ -149,11 +128,6 @@ list_docs find_doc(map_str_docs& mp, str& word);
149128
*/
150129
list_docs find_answer(map_str_docs& mp, str& input);
151130

152-
/**
153-
* @brief Removes unwanted characters from a string, such as extra spaces.
154-
* @param input String to process.
155-
*/
156-
void shrink_string(std::string* input);
157131
} // namespace inverted_index
158132

159133
#endif // INVERTED_INDEX

lib/include/preprocessing/stemmer.h

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <vector>
1313

1414
#include "exceptions/invalid_pointer_exception.h"
15+
#include "utils/string_operations.h"
1516

1617
namespace stemmer {
1718

@@ -89,26 +90,6 @@ class RSPL {
8990
* @return `true` if the word ends with 'a', otherwise `false`.
9091
*/
9192
bool endsWithA(const std::string& word);
92-
93-
/**
94-
* @brief Splits a string into parts based on delimiters.
95-
* @param s The string to be split.
96-
* @return A vector containing the parts of the string.
97-
*/
98-
std::vector<std::string> split(std::string& s);
99-
100-
/**
101-
* @brief Removes accents from a string.
102-
* @param input The input string.
103-
* @return The string without accents.
104-
*/
105-
std::string removeAccents(const std::string& input);
106-
107-
/**
108-
* @brief Shrinks the size of a string to normalize it.
109-
* @param input Pointer to the input string.
110-
*/
111-
void shrinkString(std::string* input);
11293
};
11394

11495
} // namespace stemmer

lib/include/utils/string_operations.h

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#pragma once

#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <unicode/ustream.h>

#include <codecvt>
#include <locale>
#include <string>
#include <unordered_map>
#include <vector>
9+
10+
namespace utils {

/**
 * @var DELIMITER
 * @brief Default delimiter used to split strings (a single space).
 *
 * Declared `inline const` instead of `constexpr`: a namespace-scope
 * `constexpr std::string` is rejected in C++17 (std::string is not a literal
 * type) and is not portable in C++20. The type stays `std::string` so the
 * constant can still be passed directly to split().
 */
inline const std::string DELIMITER{" "};

/**
 * @brief Mapping to normalize accented characters to their ASCII
 * equivalents.
 *
 * Keys are accented Latin letters (lowercase and uppercase vowels plus
 * 'ç'/'Ç'); values are the unaccented letter with the same case.
 * Declared `inline` so every translation unit shares a single map.
 */
inline const std::unordered_map<wchar_t, wchar_t> accentMap_ = {
    {L'á', L'a'}, {L'à', L'a'}, {L'â', L'a'}, {L'ã', L'a'}, {L'ä', L'a'},
    {L'é', L'e'}, {L'è', L'e'}, {L'ê', L'e'}, {L'ë', L'e'}, {L'í', L'i'},
    {L'ì', L'i'}, {L'î', L'i'}, {L'ï', L'i'}, {L'ó', L'o'}, {L'ò', L'o'},
    {L'ô', L'o'}, {L'õ', L'o'}, {L'ö', L'o'}, {L'ú', L'u'}, {L'ù', L'u'},
    {L'û', L'u'}, {L'ü', L'u'}, {L'ç', L'c'}, {L'Á', L'A'}, {L'À', L'A'},
    {L'Â', L'A'}, {L'Ã', L'A'}, {L'Ä', L'A'}, {L'É', L'E'}, {L'È', L'E'},
    {L'Ê', L'E'}, {L'Ë', L'E'}, {L'Í', L'I'}, {L'Ì', L'I'}, {L'Î', L'I'},
    {L'Ï', L'I'}, {L'Ó', L'O'}, {L'Ò', L'O'}, {L'Ô', L'O'}, {L'Õ', L'O'},
    {L'Ö', L'O'}, {L'Ú', L'U'}, {L'Ù', L'U'}, {L'Û', L'U'}, {L'Ü', L'U'},
    {L'Ç', L'C'}};

// String manipulation functions

/**
 * @brief Converts a UTF-8 encoded string to lowercase, in place.
 * @param input Pointer to the string to normalize. A null pointer is
 * ignored and the function returns without doing anything.
 */
void shrink_string(std::string* input);

/**
 * @brief Splits a string based on a delimiter.
 *
 * Empty tokens are kept, so consecutive delimiters produce empty strings
 * in the result.
 *
 * @param s String to split. It is consumed by the call: on return it holds
 * only the last token.
 * @param delimiter Delimiter to split the string.
 * @return Vector of strings resulting from the split (always at least one
 * element).
 */
std::vector<std::string> split(std::string& s, const std::string& delimiter);

/**
 * @brief Removes accents from a string.
 *
 * Every character that has an entry in accentMap_ is replaced by its
 * unaccented equivalent; all other characters are kept as they are.
 *
 * @param input The input string, interpreted as UTF-8.
 * @return The string without accents, encoded as UTF-8.
 */
std::string removeAccents(const std::string& input);

}  // namespace utils

lib/src/inverted_index.cpp

Lines changed: 4 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -8,36 +8,11 @@
88

99
using namespace inverted_index;
1010

11-
vector_str inverted_index::split(str& s, const str& delimiter) {
12-
vector_str tokens;
13-
size_t pos = 0;
14-
std::string token;
15-
while ((pos = s.find(delimiter)) != std::string::npos) {
16-
token = s.substr(0, pos);
17-
tokens.push_back(token);
18-
s.erase(0, pos + delimiter.length());
19-
}
20-
tokens.push_back(s);
21-
22-
return tokens;
23-
}
24-
25-
void inverted_index::shrink_string(std::string* input) {
26-
if (!input)
27-
return; // Verifica se o ponteiro é válido
28-
29-
icu::UnicodeString ustr(input->c_str(), "UTF-8");
30-
ustr.toLower();
31-
std::string result;
32-
ustr.toUTF8String(result);
33-
*input = result;
34-
}
35-
3611
map_str_docs inverted_index::add_doc(map_str_docs& mp,
3712
const str& doc_name,
3813
str& text) {
39-
shrink_string(&text);
40-
auto words = inverted_index::split(text, DELIMITER);
14+
utils::shrink_string(&text);
15+
auto words = utils::split(text, utils::DELIMITER);
4116

4217
for (const auto& word : words) {
4318
docs target = {doc_name, 1};
@@ -61,8 +36,8 @@ list_docs inverted_index::find_answer(map_str_docs& mp, str& input) {
6136
list_docs result;
6237
set_docs unique_docs;
6338

64-
shrink_string(&input);
65-
auto words = inverted_index::split(input, DELIMITER);
39+
utils::shrink_string(&input);
40+
auto words = utils::split(input, utils::DELIMITER);
6641

6742
for (auto& word : words) {
6843
list_docs docs = inverted_index::find_doc(

lib/src/preprocessing/stemmer.cpp

Lines changed: 3 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -164,55 +164,6 @@ bool RSPL::endsWithS(const std::string& word) {
164164
return false;
165165
}
166166

167-
std::vector<std::string> RSPL::split(std::string& s) {
168-
std::vector<std::string> tokens;
169-
size_t pos = 0;
170-
std::string token;
171-
std::string delimiter = " ";
172-
while ((pos = s.find(delimiter)) != std::string::npos) {
173-
token = s.substr(0, pos);
174-
tokens.push_back(token);
175-
s.erase(0, pos + delimiter.length());
176-
}
177-
tokens.push_back(s);
178-
179-
return tokens;
180-
}
181-
182-
std::string RSPL::removeAccents(const std::string& input) {
183-
std::wstring winput =
184-
std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(input);
185-
std::wstring woutput;
186-
woutput.reserve(winput.size()); // Evitar alocações desnecessárias
187-
188-
// Processar a string como wstring
189-
for (wchar_t ch : winput) {
190-
if (accentMap_.count(ch)) {
191-
woutput.push_back(accentMap_.at(ch)); // Substituir acentuados
192-
} else {
193-
woutput.push_back(ch); // Mantém o caractere não acentuado
194-
}
195-
}
196-
197-
// Converter de volta para std::string
198-
return std::wstring_convert<std::codecvt_utf8<wchar_t>>().to_bytes(woutput);
199-
}
200-
201-
void RSPL::shrinkString(std::string* input) {
202-
try {
203-
if (!input)
204-
throw exceptions::invalid_pointer_exception();
205-
206-
icu::UnicodeString ustr(input->c_str(), "UTF-8");
207-
ustr.toLower();
208-
std::string result;
209-
ustr.toUTF8String(result);
210-
*input = result;
211-
} catch (const std::exception& e) {
212-
std::cerr << e.what() << '\n';
213-
}
214-
}
215-
216167
bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
217168
for (const auto& rule : rules) {
218169
// Verificar se a palavra termina com o sufixo especificado
@@ -234,9 +185,9 @@ bool RSPL::applyRules(std::string& word, const std::vector<StepRule>& rules) {
234185

235186
void RSPL::run(std::string* sentence) {
236187
// Separar a sentença em palavras
237-
this->shrinkString(sentence);
188+
utils::shrink_string(sentence);
238189
// std::cout << *sentence << std::endl;
239-
std::vector<std::string> words = this->split(*sentence);
190+
std::vector<std::string> words = utils::split(*sentence, utils::DELIMITER);
240191

241192
for (std::string& word : words) {
242193
// PLURAL REDUCTION
@@ -271,7 +222,7 @@ void RSPL::run(std::string* sentence) {
271222
}
272223

273224
// Função para remover acentos
274-
word = removeAccents(word);
225+
word = utils::removeAccents(word);
275226
// std::cout << word << std::endl;
276227
}
277228

lib/src/utils/string_operations.cpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#include "utils/string_operations.h"
2+
3+
namespace utils {
4+
5+
/**
 * @brief Converts a UTF-8 encoded string to lowercase, in place.
 *
 * @param input Pointer to the string to normalize. A null pointer is
 * ignored and the function returns without doing anything.
 */
void shrink_string(std::string* input) {
  if (input == nullptr) {
    return;  // Nothing to normalize.
  }

  // fromUTF8 uses the string's real length, so text after an embedded NUL
  // is not silently dropped (the c_str()-based constructor stops there).
  icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(*input);

  // Lowercase with the root locale so the result does not depend on the
  // process default locale; indexed text and queries must normalize the
  // same way regardless of where the program runs.
  ustr.toLower(icu::Locale::getRoot());

  // toUTF8String appends, so convert into a fresh string and swap it in.
  std::string result;
  ustr.toUTF8String(result);
  input->swap(result);
}
15+
16+
/**
 * @brief Splits a string into the tokens separated by a delimiter.
 *
 * Empty tokens are kept: consecutive delimiters, or a delimiter at either
 * end of the string, produce empty strings in the result.
 *
 * @param s String to split. It is consumed by the call: on return it holds
 * only the last token (the text after the final delimiter).
 * @param delimiter Delimiter to split on. If it is empty, no splitting is
 * done: the whole string is returned as the single token and @p s is left
 * untouched.
 * @return Vector of tokens; always contains at least one element.
 */
std::vector<std::string> split(std::string& s, const std::string& delimiter) {
  std::vector<std::string> tokens;

  // An empty delimiter matches at position 0 without consuming any input,
  // which would make the scan below loop forever.
  if (delimiter.empty()) {
    tokens.push_back(s);
    return tokens;
  }

  // Scan with a moving offset instead of erasing the prefix after every
  // token; repeated erase(0, ...) makes the split quadratic.
  std::size_t start = 0;
  std::size_t pos = 0;
  while ((pos = s.find(delimiter, start)) != std::string::npos) {
    tokens.emplace_back(s, start, pos - start);
    start = pos + delimiter.length();
  }

  // Keep the established side effect: the input is reduced to its last
  // token, which is also the final element of the result.
  s.erase(0, start);
  tokens.push_back(s);

  return tokens;
}
29+
30+
std::string removeAccents(const std::string& input) {
31+
std::wstring winput =
32+
std::wstring_convert<std::codecvt_utf8<wchar_t>>().from_bytes(input);
33+
std::wstring woutput;
34+
woutput.reserve(winput.size()); // Evitar alocações desnecessárias
35+
36+
// Processar a string como wstring
37+
for (wchar_t ch : winput) {
38+
if (accentMap_.count(ch)) {
39+
woutput.push_back(accentMap_.at(ch)); // Substituir acentuados
40+
} else {
41+
woutput.push_back(ch); // Mantém o caractere não acentuado
42+
}
43+
}
44+
45+
// Converter de volta para std::string
46+
return std::wstring_convert<std::codecvt_utf8<wchar_t>>().to_bytes(woutput);
47+
}
48+
49+
} // namespace utils

0 commit comments

Comments
 (0)