
Commit 4bd3a2c

recursive downloading
1 parent 95f116e commit 4bd3a2c

File tree

4 files changed: +44, -10 lines changed

Diff for: Crowler.cpp

+36, -9
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <string>
 #include <unordered_map>
+#include <boost/algorithm/string.hpp>
 #include "DbManager.h"
 #include "IniParser.h"

@@ -58,12 +59,17 @@ void Crowler::processUrl(std::string domain, std::string path, short depth)
 
     // if depth is not 1, crawl internal resources with depth reduced by 1
     // if we got here (recursively or right away) with depth 1, there is no further crawling
-    // if (depth != 1) {
-    //     depth--;
-    //     for (auto& subUrl : subUrls) {
-    //         processUrl(subUrl, depth);
-    //     }
-    // }
+    if (depth != 1) {
+        depth--;
+        for (auto& subUrl : subUrls) {
+            if (subUrl.size() > 1) {
+                std::pair<std::string, std::string> domain_path = parseSubUrl(domain, subUrl);
+                std::string domain = domain_path.first;
+                std::string path = domain_path.second;
+                processUrl(domain, path, depth);
+            }
+        }
+    }
 }
 
 std::string Crowler::download(std::string domain, std::string path)
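
Note on the recursion above: depth counts down by one per level and the descent stops once it reaches 1, so with recursionDepth=2 from config.ini the start page is processed and each of its sub-URLs is processed exactly once. A minimal standalone sketch of that depth rule (fetchLinks is a hypothetical stand-in for the real download() + getDataFromHtml() pipeline, not the project's API):

#include <iostream>
#include <string>
#include <vector>

// Hypothetical link extractor: pretend every page links to two sub-pages.
std::vector<std::string> fetchLinks(const std::string& url) {
    return { url + "/a", url + "/b" };
}

// Same depth rule as Crowler::processUrl: depth 1 means "do not descend further".
void crawl(const std::string& url, short depth) {
    std::cout << "visit " << url << " (depth " << depth << ")\n";
    std::vector<std::string> subUrls = fetchLinks(url);
    if (depth != 1) {
        depth--;
        for (auto& subUrl : subUrls) {
            crawl(subUrl, depth);
        }
    }
}

int main() {
    crawl("wiki.openssl.org/index.php/EVP", 2);  // visits the start page and its two direct links
    return 0;
}
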
@@ -136,7 +142,8 @@ std::string Crowler::download(std::string domain, std::string path)
         break;
     default:
         std::cout << "Unexpected HTTP status " << res.result_int() << "\n";
-        std::cout << domain + path << res.result_int() << "\n";
+        std::cout << domain << "\n";
+        std::cout << path << "\n";
         break;
     }
     return strBody;
@@ -156,8 +163,8 @@ std::vector<std::string> Crowler::getDataFromHtml(std::string s, std::regex filter)
 
     auto words_begin = std::sregex_iterator(s.begin(), s.end(), filter);
     auto words_end = std::sregex_iterator();
-    std::regex remove_prefix("<a href=");
-    std::regex remove_suffix("#.*");
+    std::regex remove_prefix("<a href=\"");
+    std::regex remove_suffix("[#\"].*");
 
     for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
         std::smatch match = *i;
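
For context, the tightened patterns now strip the opening quote together with the <a href=" prefix and cut everything from the first fragment marker or closing quote, so an extracted sub-URL no longer keeps a trailing quote or #fragment. A standalone sketch, assuming the surrounding loop applies the two regexes with std::regex_replace (the diff does not show that part, and the sample anchor is made up):

#include <iostream>
#include <regex>
#include <string>

int main() {
    // One match as the href filter might capture it (hypothetical sample).
    std::string match = "<a href=\"/index.php/EVP#Examples\"";
    std::regex remove_prefix("<a href=\"");
    std::regex remove_suffix("[#\"].*");
    std::string subUrl = std::regex_replace(match, remove_prefix, "");
    subUrl = std::regex_replace(subUrl, remove_suffix, "");
    std::cout << subUrl << "\n";  // prints: /index.php/EVP
    return 0;
}
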
@@ -228,3 +235,23 @@ void Crowler::work() {
         }
     }
 }
+
+std::pair<std::string, std::string> Crowler::parseSubUrl(std::string domain, std::string subUrl) {
+
+    std::string path;
+
+    if (subUrl.find("http") != std::string::npos) {
+        std::vector<std::string> parts;
+        boost::split(parts, subUrl, boost::is_any_of("/"));
+        domain = parts[2];
+        path = parts[3];
+        for (size_t i=4; i<parts.size(); ++i) {
+            path += "/";
+            path += parts[i];
+        }
+    } else {
+        path = subUrl;
+    }
+    return std::pair(domain, path);
+
+}
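
For reference, the new parseSubUrl splits an absolute URL on '/' with boost::split, takes parts[2] as the domain and rejoins the remaining parts as the path (note the result has no leading slash); relative links keep the current domain. A standalone sketch of the same logic, fed the sample URL from the commented-out test in main.cpp below:

#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <boost/algorithm/string.hpp>

std::pair<std::string, std::string> parseSubUrl(std::string domain, std::string subUrl) {
    std::string path;
    if (subUrl.find("http") != std::string::npos) {
        // "http://google.com/index.php/Special:MyContributions" splits into
        // { "http:", "", "google.com", "index.php", "Special:MyContributions" }
        std::vector<std::string> parts;
        boost::split(parts, subUrl, boost::is_any_of("/"));
        domain = parts[2];
        path = parts[3];
        for (size_t i = 4; i < parts.size(); ++i) {
            path += "/";
            path += parts[i];
        }
    } else {
        path = subUrl;  // relative link: keep the current domain
    }
    return std::pair(domain, path);
}

int main() {
    auto test = parseSubUrl("www.ya.ru", "http://google.com/index.php/Special:MyContributions");
    std::cout << test.first << std::endl;   // google.com
    std::cout << test.second << std::endl;  // index.php/Special:MyContributions
    return 0;
}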

Diff for: Crowler.h

+2
@@ -32,6 +32,8 @@ class Crowler
     void addToCrowlingQueue(std::string domain, std::string pat, unsigned short depth);
     // method for taking the next resource-processing task from the task queue and processing it
     void work();
+    // method for splitting an internal url into domain and path
+    std::pair<std::string, std::string> parseSubUrl(std::string domain, std::string subUrl);
 
 public:
     Crowler();

Diff for: config.ini

+1, -1
@@ -7,6 +7,6 @@ password=searcher
 [Crowler]
 startPageDomain=wiki.openssl.org
 startPagePath=/index.php/EVP
-recursionDepth=1
+recursionDepth=2
 [Searcher]
 serverPort=8080

Diff for: main.cpp

+5
@@ -5,6 +5,7 @@
 #include "Searcher.h"
 
 
+
 int main()
 {
     // Testing
@@ -68,5 +69,9 @@ int main()
 //     std::cout << i << std::endl;
 // }
 
+    // std::string subUrl = "http://google.com/index.php/Special:MyContributions";
+    // std::pair<std::string, std::string> test = parseSubUrl("www.ya.ru", subUrl);
+    // std::cout << test.first << std::endl;
+    // std::cout << test.second << std::endl;
     return 0;
 }
