Skip to content

Commit bff643b

Browse files
committed
bugfixes
1 parent ed67432 commit bff643b

File tree

5 files changed

+108
-95
lines changed

5 files changed

+108
-95
lines changed

Crowler.cpp

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,19 @@ Crowler::~Crowler()
4747
void Crowler::processUrl(std::string domain, std::string path, short depth)
4848
{
4949
// полный процессинг ресурса: получение слов, сохранение в базу данных, получение внутренних ресурсов
50+
std::cout << "Processing sub url " << domain << path << "\n";
5051
std::string html = download(domain, path);
51-
// std::cout << html << std::endl; // test
52+
// std::cout << "\n\n\n\n===================================\n\n\n\n";
53+
// std::cout << "HTML in processUrl func:\n" << html << std::endl; // test
5254
std::vector<std::string> words = getWords(html);
5355
std::vector<std::string> subUrls = getSubUrls(html);
56+
57+
// std::cout << "Got words:\n";
58+
// for (auto& i : words ) {
59+
// std::cout << i << std::endl;
60+
// }
61+
62+
// std::cout << "Got sub urls:\n";
5463
// for (auto& i : subUrls ) {
5564
// std::cout << i << std::endl;
5665
// }
@@ -76,6 +85,7 @@ std::string Crowler::download(std::string domain, std::string path)
7685
{
7786
try
7887
{
88+
std::cout << "Crowling from " << domain << " / " << path << "...\n";
7989
std::string const port = "443";
8090
int const version = 11;
8191

@@ -129,14 +139,18 @@ std::string Crowler::download(std::string domain, std::string path)
129139

130140
std::string strBody;
131141
beast::error_code ec;
142+
std::string newLocation;
143+
std::pair<std::string, std::string> newPair;
144+
std::cout << "http status is " << res.base().result_int() << "\n";
132145

133146
switch(res.base().result_int()) {
134147
case 301:
135148
std::cout << "Redirecting.....\n";
136-
download(res.base()["Location"], "");//.to_string());
149+
newLocation = res.base()["Location"];
150+
newPair = parseSubUrl(domain, newLocation);
151+
strBody = download(newPair.first, "/" + newPair.second);
137152
break;
138153
case 200:
139-
// default:
140154
strBody = boost::beast::buffers_to_string(res.body().data());
141155
stream.shutdown(ec);
142156
break;
@@ -146,6 +160,7 @@ std::string Crowler::download(std::string domain, std::string path)
146160
std::cout << path << "\n";
147161
break;
148162
}
163+
// std::cout << "HTML in download func:\n" << strBody << "\n";
149164
return strBody;
150165

151166

DbManager.cpp

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -243,39 +243,50 @@ std::vector<int> DbManager::getUrlsIdsByWords(std::vector<std::string> words)
243243
std::vector<int> urlIds;
244244
std::vector<int> urlIdsAccepted;
245245
std::vector<int> word_ids = getWordsIds(words);
246+
246247
pqxx::work tx{ *conn };
247248
for (auto [urlIdd] : tx.query<int>("select url_id from frequencies " "where word_id in (" + getStringFromVector(word_ids) + ");")) {
248249
urlIds.push_back(urlIdd);
249250
}
251+
// for (auto& i : urlIds) {
252+
// std::cout << i << "\n";
253+
// }
250254

251255
// фильтрация ответа по тем ресурсам, которых вернулось не меньше, чем слов
252256
for (auto pair : adjacent_count(urlIds)) {
253257
int urlId = pair.first;
254258
int count = pair.second;
259+
// std::cout << urlId << "\n";
260+
// std::cout << count << "\n";
261+
// std::cout << words.size() << "\n";
255262
if (words.size() == count) {
256263
urlIdsAccepted.push_back(urlId);
257264
}
258265
}
266+
// for (auto& i : urlIdsAccepted) {
267+
// std::cout << i << "\n";
268+
// }
259269
return urlIdsAccepted;
260270
}
261271

262272
std::vector<std::string> DbManager::getSortedUrlsByWords(std::vector<std::string> words)
263273
{
264274
// получение списка ресурсов, сразу отсортированного по сумме вхождений слов
265-
275+
std::vector<std::string> sortedUrls;
266276
std::vector<int> url_ids = getUrlsIdsByWords(words);
267277
std::vector<int> word_ids = getWordsIds(words);
268278

269-
std::vector<std::string> sortedUrls;
270-
pqxx::work tx{ *conn };
271-
for (auto& [url, freq] : tx.query<std::string, int>("select u.url, sum(f.frequency) sum_freq from frequencies f "
272-
"join words w on f.word_id = w.id "
273-
"join urls u on f.url_id = u.id "
274-
"where word_id in (" + getStringFromVector(word_ids) + ") "
275-
"and url_id in (" + getStringFromVector(url_ids) + ") "
276-
"group by url "
277-
"order by sum_freq DESC")) {
278-
sortedUrls.push_back(url);
279+
if (url_ids.size() > 0 and word_ids.size() > 0) {
280+
pqxx::work tx{ *conn };
281+
for (auto& [url, freq] : tx.query<std::string, int>("select u.url, sum(f.frequency) sum_freq from frequencies f "
282+
"join words w on f.word_id = w.id "
283+
"join urls u on f.url_id = u.id "
284+
"where word_id in (" + getStringFromVector(word_ids) + ") "
285+
"and url_id in (" + getStringFromVector(url_ids) + ") "
286+
"group by url "
287+
"order by sum_freq DESC")) {
288+
sortedUrls.push_back(url);
289+
}
279290
}
280291
return sortedUrls;
281292
}

config.ini

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ userName=searcher
66
password=searcher
77
[Crowler]
88
startPageDomain=wiki.openssl.org
9-
startPagePath=/index.php/EVP
9+
startPagePath=/
1010
recursionDepth=2
1111
[Searcher]
12-
serverPort=8080
12+
serverIp=0.0.0.0
13+
serverPort=9093

http_server_small.cpp

Lines changed: 50 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@
2525
#include <iostream>
2626
#include <memory>
2727
#include <string>
28-
#include "Searcher.h""
28+
#include "Searcher.h"
29+
#include "IniParser.h"
2930

3031
namespace beast = boost::beast; // from <boost/beast.hpp>
3132
namespace http = beast::http; // from <boost/beast/http.hpp>
@@ -140,44 +141,16 @@ class http_connection : public std::enable_shared_from_this<http_connection>
140141
void
141142
create_response()
142143
{
143-
if(request_.target() == "/count")
144-
{
145-
response_.set(http::field::content_type, "text/html");
146-
beast::ostream(response_.body())
147-
<< "<html>\n"
148-
<< "<head><title>Request count</title></head>\n"
149-
<< "<body>\n"
150-
<< "<h1>Request count</h1>\n"
151-
<< "<p>There have been "
152-
<< my_program_state::request_count()
153-
<< " requests so far.</p>\n"
154-
<< "</body>\n"
155-
<< "</html>\n";
156-
}
157-
else if(request_.target() == "/time")
158-
{
159-
response_.set(http::field::content_type, "text/html");
160-
beast::ostream(response_.body())
161-
<< "<html>\n"
162-
<< "<head><title>Current time</title></head>\n"
163-
<< "<body>\n"
164-
<< "<h1>Current time</h1>\n"
165-
<< "<p>The current time is "
166-
<< my_program_state::now()
167-
<< " seconds since the epoch.</p>\n"
168-
<< "</body>\n"
169-
<< "</html>\n";
170-
}
171-
else if(request_.target() == "/search")
144+
if(request_.target() == "/")
172145
{
173146
response_.set(http::field::content_type, "text/html");
174147
beast::ostream(response_.body())
175148
<< "<!DOCTYPE html>\n"
176149
<< "<html>\n"
177150
<< "<body>\n"
178-
<< "<form action=\"/search\" method=\"post\">\n"
151+
<< "<form action=\"/\" method=\"post\">\n"
179152
<< "<label for=\"fname\">Search request:</label><br>\n"
180-
<< "<input type=\"text\" id=\"request\" name=\"request\" value=\"what?\"><br><br>\n"
153+
<< "<input type=\"text\" id=\"request\" name=\"request\" value=\"than\"><br><br>\n"
181154
<< "<input type=\"submit\" value=\"Submit\">\n"
182155
<< "</form>\n"
183156
<< "</body>\n"
@@ -225,25 +198,27 @@ class http_connection : public std::enable_shared_from_this<http_connection>
225198
std::cout << e.what() << std::endl;
226199
}
227200

228-
// преобразование списка ресурсов к HTML-формату
229-
std::string hrefsStr = getHrefListStringFromVector(urls);
230-
231-
// формирование ответа
232-
response_.set(http::field::content_type, "text/html");
233-
beast::ostream(response_.body())
234-
<< "<!DOCTYPE html>\n"
235-
<< "<html>\n"
236-
<< "<body>\n"
237-
<< "<form action=\"/search\" method=\"post\">\n"
238-
<< "<label for=\"fname\">Search request:</label><br>\n"
239-
<< "<input type=\"text\" id=\"request\" name=\"request\" value=\"what?\"><br><br>\n"
240-
<< "<input type=\"submit\" value=\"Submit\">\n"
241-
<< "</form>\n"
242-
<< "<p>\n"
243-
<< hrefsStr
244-
<< "</p>\n"
245-
<< "</body>\n"
246-
<< "</html>\n";
201+
if (urls.size() > 0) {
202+
// преобразование списка ресурсов к HTML-формату
203+
std::string hrefsStr = getHrefListStringFromVector(urls);
204+
205+
// формирование ответа
206+
response_.set(http::field::content_type, "text/html");
207+
beast::ostream(response_.body())
208+
<< "<!DOCTYPE html>\n"
209+
<< "<html>\n"
210+
<< "<body>\n"
211+
<< "<form action=\"/search\" method=\"post\">\n"
212+
<< "<label for=\"fname\">Search request:</label><br>\n"
213+
<< "<input type=\"text\" id=\"request\" name=\"request\" value=\"than\"><br><br>\n"
214+
<< "<input type=\"submit\" value=\"Submit\">\n"
215+
<< "</form>\n"
216+
<< "<p>\n"
217+
<< hrefsStr
218+
<< "</p>\n"
219+
<< "</body>\n"
220+
<< "</html>\n";
221+
}
247222
}
248223

249224
// Asynchronously transmit the response message.
@@ -285,25 +260,32 @@ class http_connection : public std::enable_shared_from_this<http_connection>
285260
{
286261
// преобразование вектора ресурсов к их перечислению в формате HTML
287262

288-
std::string line = "";
289-
auto it = urlsVector.begin();
290-
std::string val = *it;
291-
line += "<a href=\">";
292-
line += (val);
293-
line += "\">";
294-
line += (val);
295-
line += "</a>";
263+
std::string line = "No results";
264+
if (urlsVector.size() > 0) {
296265

297-
while(it != urlsVector.end() - 1)
298-
{
299-
++it;
300-
line += "<br>";
266+
std::string line = "";
267+
auto it = urlsVector.begin();
301268
std::string val = *it;
302-
line += "<a href=\">";
269+
val = "https://" + val;
270+
line += "<a href=\"";
303271
line += (val);
304272
line += "\">";
305273
line += (val);
306274
line += "</a>";
275+
276+
while(it != urlsVector.end() - 1)
277+
{
278+
++it;
279+
line += "<br>";
280+
std::string val = *it;
281+
val = "https://" + val;
282+
line += "<a href=\"";
283+
line += (val);
284+
line += "\">";
285+
line += (val);
286+
line += "</a>";
287+
}
288+
return line;
307289
}
308290
return line;
309291
}
@@ -327,19 +309,10 @@ main(int argc, char* argv[])
327309
{
328310
try
329311
{
330-
// Check command line arguments.
331-
if(argc != 3)
332-
{
333-
std::cerr << "Usage: " << argv[0] << " <address> <port>\n";
334-
std::cerr << " For IPv4, try:\n";
335-
std::cerr << " receiver 0.0.0.0 80\n";
336-
std::cerr << " For IPv6, try:\n";
337-
std::cerr << " receiver 0::0 80\n";
338-
return EXIT_FAILURE;
339-
}
340-
341-
auto const address = net::ip::make_address(argv[1]);
342-
unsigned short port = static_cast<unsigned short>(std::atoi(argv[2]));
312+
IniParser parser(CONFIG_PATH);
313+
std::string serverIp = parser.get_value<std::string>("Searcher.serverIp");
314+
auto const address = net::ip::make_address(serverIp);
315+
unsigned short port = parser.get_value<unsigned short>("Searcher.serverPort");
343316

344317
net::io_context ioc{1};
345318

main.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,16 @@
33
#include <vector>
44
#include "Crowler.h"
55
#include "Searcher.h"
6+
#include "DbManager.h"
67

78

89

910
int main()
1011
{
11-
// Тестирование
12-
1312
Crowler crowler = Crowler();
1413
crowler.processStartPage();
14+
15+
// Тестирование
1516
// std::string res = crowler.download("litres.ru");
1617
// std::cout << res << std::endl;
1718
// std::vector<std::string> words = crowler.getWords(res);
@@ -73,5 +74,17 @@ int main()
7374
// std::pair<std::string, std::string> test = parseSubUrl("www.ya.ru", subUrl);
7475
// std::cout << test.first << std::endl;
7576
// std::cout << test.second << std::endl;
77+
78+
// DbManager dbManager = DbManager();
79+
// Searcher searcher = Searcher();
80+
// std::vector<std::string> testWords;
81+
// testWords.push_back("what");
82+
// std::vector<std::string> res = searcher.processSearchRequest(testWords);
83+
// // std::vector<std::string> res = dbManager.getSortedUrlsByWords(testWords);
84+
// std::cout << res.size() << "\n";
85+
// for (auto& i : res) {
86+
// std::cout << i << std::endl;
87+
// }
88+
7689
return 0;
7790
}

0 commit comments

Comments
 (0)