1414#include < iostream>
1515#include < string>
1616#include < unordered_map>
17+ #include < boost/algorithm/string.hpp>
1718#include " DbManager.h"
1819#include " IniParser.h"
1920
@@ -58,12 +59,17 @@ void Crowler::processUrl(std::string domain, std::string path, short depth)
5859
5960 // если глубина не 1, обход внутренних ресурсов с уменьшенной на 1 глубиной
6061 // если рекурсивно (или сразу) попали сюда с глубиной 1, дальнейшего обхода не будет
61- // if (depth != 1) {
62- // depth--;
63- // for (auto& subUrl : subUrls) {
64- // processUrl(subUrl, depth);
65- // }
66- // }
62+ if (depth != 1 ) {
63+ depth--;
64+ for (auto & subUrl : subUrls) {
65+ if (subUrl.size () > 1 ) {
66+ std::pair<std::string, std::string> domain_path = parseSubUrl (domain, subUrl);
67+ std::string domain = domain_path.first ;
68+ std::string path = domain_path.second ;
69+ processUrl (domain, path, depth);
70+ }
71+ }
72+ }
6773}
6874
6975std::string Crowler::download (std::string domain, std::string path)
@@ -136,7 +142,8 @@ std::string Crowler::download(std::string domain, std::string path)
136142 break ;
137143 default :
138144 std::cout << " Unexpected HTTP status " << res.result_int () << " \n " ;
139- std::cout << domain + path << res.result_int () << " \n " ;
145+ std::cout << domain << " \n " ;
146+ std::cout << path << " \n " ;
140147 break ;
141148 }
142149 return strBody;
@@ -156,8 +163,8 @@ std::vector<std::string> Crowler::getDataFromHtml(std::string s, std::regex filt
156163
157164 auto words_begin = std::sregex_iterator (s.begin (), s.end (), filter);
158165 auto words_end = std::sregex_iterator ();
159- std::regex remove_prefix (" <a href=" );
160- std::regex remove_suffix (" # .*" );
166+ std::regex remove_prefix (" <a href=\" " );
167+ std::regex remove_suffix (" [# \" ] .*" );
161168
162169 for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
163170 std::smatch match = *i;
@@ -228,3 +235,23 @@ void Crowler::work() {
228235 }
229236 }
230237}
238+
239+ std::pair<std::string, std::string> Crowler::parseSubUrl (std::string domain, std::string subUrl) {
240+
241+ std::string path;
242+
243+ if (subUrl.find (" http" ) != std::string::npos) {
244+ std::vector<std::string> parts;
245+ boost::split (parts, subUrl, boost::is_any_of (" /" ));
246+ domain = parts[2 ];
247+ path = parts[3 ];
248+ for (size_t i=4 ; i<parts.size (); ++i) {
249+ path += " /" ;
250+ path += parts[i];
251+ }
252+ } else {
253+ path = subUrl;
254+ }
255+ return std::pair (domain, path);
256+
257+ }
0 commit comments