14
14
#include < iostream>
15
15
#include < string>
16
16
#include < unordered_map>
17
+ #include < boost/algorithm/string.hpp>
17
18
#include " DbManager.h"
18
19
#include " IniParser.h"
19
20
@@ -58,12 +59,17 @@ void Crowler::processUrl(std::string domain, std::string path, short depth)
58
59
59
60
// если глубина не 1, обход внутренних ресурсов с уменьшенной на 1 глубиной
60
61
// если рекурсивно (или сразу) попали сюда с глубиной 1, дальнейшего обхода не будет
61
- // if (depth != 1) {
62
- // depth--;
63
- // for (auto& subUrl : subUrls) {
64
- // processUrl(subUrl, depth);
65
- // }
66
- // }
62
+ if (depth != 1 ) {
63
+ depth--;
64
+ for (auto & subUrl : subUrls) {
65
+ if (subUrl.size () > 1 ) {
66
+ std::pair<std::string, std::string> domain_path = parseSubUrl (domain, subUrl);
67
+ std::string domain = domain_path.first ;
68
+ std::string path = domain_path.second ;
69
+ processUrl (domain, path, depth);
70
+ }
71
+ }
72
+ }
67
73
}
68
74
69
75
std::string Crowler::download (std::string domain, std::string path)
@@ -136,7 +142,8 @@ std::string Crowler::download(std::string domain, std::string path)
136
142
break ;
137
143
default :
138
144
std::cout << " Unexpected HTTP status " << res.result_int () << " \n " ;
139
- std::cout << domain + path << res.result_int () << " \n " ;
145
+ std::cout << domain << " \n " ;
146
+ std::cout << path << " \n " ;
140
147
break ;
141
148
}
142
149
return strBody;
@@ -156,8 +163,8 @@ std::vector<std::string> Crowler::getDataFromHtml(std::string s, std::regex filt
156
163
157
164
auto words_begin = std::sregex_iterator (s.begin (), s.end (), filter);
158
165
auto words_end = std::sregex_iterator ();
159
- std::regex remove_prefix (" <a href=" );
160
- std::regex remove_suffix (" # .*" );
166
+ std::regex remove_prefix (" <a href=\" " );
167
+ std::regex remove_suffix (" [# \" ] .*" );
161
168
162
169
for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
163
170
std::smatch match = *i;
@@ -228,3 +235,23 @@ void Crowler::work() {
228
235
}
229
236
}
230
237
}
238
+
239
+ std::pair<std::string, std::string> Crowler::parseSubUrl (std::string domain, std::string subUrl) {
240
+
241
+ std::string path;
242
+
243
+ if (subUrl.find (" http" ) != std::string::npos) {
244
+ std::vector<std::string> parts;
245
+ boost::split (parts, subUrl, boost::is_any_of (" /" ));
246
+ domain = parts[2 ];
247
+ path = parts[3 ];
248
+ for (size_t i=4 ; i<parts.size (); ++i) {
249
+ path += " /" ;
250
+ path += parts[i];
251
+ }
252
+ } else {
253
+ path = subUrl;
254
+ }
255
+ return std::pair (domain, path);
256
+
257
+ }
0 commit comments