@@ -43,31 +43,34 @@ Crowler::~Crowler()
4343 threadsPool_.clear ();
4444}
4545
46- void Crowler::processUrl (std::string url , short depth)
46+ void Crowler::processUrl (std::string domain, std::string path , short depth)
4747{
4848 // полный процессинг ресурса: получение слов, сохранение а базу данных, получение внутренних ресурсов
49- std::string html = download (url);
49+ std::string html = download (domain, path);
50+ std::cout << html << std::endl;
5051 std::vector<std::string> words = getWords (html);
5152 std::vector<std::string> subUrls = getSubUrls (html);
53+ for (auto & i : subUrls ) {
54+ std::cout << i << std::endl;
55+ }
56+ std::string url = domain + path;
5257 savePresencesToDb (words, url);
5358
5459 // если глубина не 1, обход внутренних ресурсов с уменьшенной на 1 глубиной
5560 // если рекурсивно (или сразу) попали сюда с глубиной 1, дальнейшего обхода не будет
56- if (depth != 1 ) {
57- depth--;
58- for (auto & subUrl : subUrls) {
59- processUrl (subUrl, depth);
60- }
61- }
61+ // if (depth != 1) {
62+ // depth--;
63+ // for (auto& subUrl : subUrls) {
64+ // processUrl(subUrl, depth);
65+ // }
66+ // }
6267}
6368
64- std::string Crowler::download (std::string url )
69+ std::string Crowler::download (std::string domain, std::string path )
6570{
6671 try
6772 {
68- std::string const host = url;
6973 std::string const port = " 443" ;
70- std::string const target = " /" ;
7174 int const version = 11 ;
7275
7376 // The io_context is required for all I/O
@@ -87,12 +90,12 @@ std::string Crowler::download(std::string url)
8790 beast::ssl_stream<beast::tcp_stream> stream (ioc, ctx);
8891
8992 // Look up the domain name
90- auto const results = resolver.resolve (host , port);
93+ auto const results = resolver.resolve (domain , port);
9194
9295 // Make the connection on the IP address we get from a lookup
9396 beast::get_lowest_layer (stream).connect (results);
9497
95- if (!SSL_set_tlsext_host_name (stream.native_handle (), host .c_str ()))
98+ if (!SSL_set_tlsext_host_name (stream.native_handle (), domain .c_str ()))
9699 {
97100 boost::system::error_code ec{ static_cast <int >(::ERR_get_error ()), boost::asio::error::get_ssl_category () };
98101 throw boost::system::system_error{ ec };
@@ -102,8 +105,8 @@ std::string Crowler::download(std::string url)
102105 stream.handshake (ssl::stream_base::client);
103106
104107 // Set up an HTTP GET request message
105- http::request<http::string_body> req{http::verb::get, target , version};
106- req.set (http::field::host, host );
108+ http::request<http::string_body> req{http::verb::get, path , version};
109+ req.set (http::field::host, domain );
107110 req.set (http::field::user_agent, BOOST_BEAST_VERSION_STRING);
108111
109112 // Send the HTTP request to the remote host
@@ -124,14 +127,16 @@ std::string Crowler::download(std::string url)
124127 switch (res.base ().result_int ()) {
125128 case 301 :
126129 std::cout << " Redirecting.....\n " ;
127- download (res.base ()[" Location" ]);// .to_string());
130+ download (res.base ()[" Location" ], " " );// .to_string());
128131 break ;
129132 case 200 :
133+ // default:
130134 strBody = boost::beast::buffers_to_string (res.body ().data ());
131135 stream.shutdown (ec);
132136 break ;
133137 default :
134138 std::cout << " Unexpected HTTP status " << res.result_int () << " \n " ;
139+ std::cout << domain + path << res.result_int () << " \n " ;
135140 break ;
136141 }
137142 return strBody;
@@ -157,7 +162,7 @@ std::vector<std::string> Crowler::getDataFromHtml(std::string s, std::regex filt
157162 for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
158163 std::smatch match = *i;
159164 std::string match_without_prefix = std::regex_replace (match.str (), remove_prefix, " " );
160- std::string match_str = std::regex_replace (match. str () , remove_suffix, " " );
165+ std::string match_str = std::regex_replace (match_without_prefix , remove_suffix, " " );
161166 result.push_back (match_str);
162167 }
163168 return result;
@@ -200,14 +205,13 @@ void Crowler::processStartPage()
200205 IniParser parser (CONFIG_PATH);
201206 std::string domain = parser.get_value <std::string>(" Crowler.startPageDomain" );
202207 std::string path = parser.get_value <std::string>(" Crowler.startPagePath" );
203- std::string url = domain + path;
204208 unsigned short depth = parser.get_value <unsigned short >(" Crowler.recursionDepth" );
205- addToCrowlingQueue (url , depth);
209+ addToCrowlingQueue (domain, path , depth);
206210}
207211
208- void Crowler::addToCrowlingQueue (std::string url , unsigned short depth)
212+ void Crowler::addToCrowlingQueue (std::string domain, std::string path , unsigned short depth)
209213{
210- UrlCrowlingTask task = {url , depth};
214+ UrlCrowlingTask task = {domain, path , depth};
211215 tasksQueue_.push (task);
212216}
213217
@@ -217,7 +221,7 @@ void Crowler::work() {
217221 // если в очереди задач есть задачи, вынимаем одну и выполняем
218222 UrlCrowlingTask task;
219223 tasksQueue_.pop (task);
220- processUrl (task.url , task.depth );
224+ processUrl (task.domain , task. path , task.depth );
221225 } else {
222226 // иначе передаем управление другому потоку
223227 std::this_thread::yield ();
0 commit comments