
Commit 4bd3a2c

recursive downloading
1 parent 95f116e commit 4bd3a2c

File tree

4 files changed: +44, -10 lines changed

Diff for: Crowler.cpp

+36, -9
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <string>
 #include <unordered_map>
+#include <boost/algorithm/string.hpp>
 #include "DbManager.h"
 #include "IniParser.h"

@@ -58,12 +59,17 @@ void Crowler::processUrl(std::string domain, std::string path, short depth)
 
     // if depth is not 1, crawl internal resources with depth reduced by 1
     // if we got here (recursively or right away) with depth 1, there is no further crawling
-    // if (depth != 1) {
-    //     depth--;
-    //     for (auto& subUrl : subUrls) {
-    //         processUrl(subUrl, depth);
-    //     }
-    // }
+    if (depth != 1) {
+        depth--;
+        for (auto& subUrl : subUrls) {
+            if (subUrl.size() > 1) {
+                std::pair<std::string, std::string> domain_path = parseSubUrl(domain, subUrl);
+                std::string domain = domain_path.first;
+                std::string path = domain_path.second;
+                processUrl(domain, path, depth);
+            }
+        }
+    }
 }
 
 std::string Crowler::download(std::string domain, std::string path)
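
Note on the recursion above: depth counts down by one per level and the descent stops once it reaches 1, so with recursionDepth=2 from config.ini the start page is processed and each of its sub-URLs is processed exactly once. A minimal standalone sketch of that depth rule (fetchLinks is a hypothetical stand-in for the real download() + getDataFromHtml() pipeline, not the project's API):

#include <iostream>
#include <string>
#include <vector>

// Hypothetical link extractor: pretend every page links to two sub-pages.
std::vector<std::string> fetchLinks(const std::string& url) {
    return { url + "/a", url + "/b" };
}

// Same depth rule as Crowler::processUrl: depth 1 means "do not descend further".
void crawl(const std::string& url, short depth) {
    std::cout << "visit " << url << " (depth " << depth << ")\n";
    std::vector<std::string> subUrls = fetchLinks(url);
    if (depth != 1) {
        depth--;
        for (auto& subUrl : subUrls) {
            crawl(subUrl, depth);
        }
    }
}

int main() {
    crawl("wiki.openssl.org/index.php/EVP", 2);  // visits the start page and its two direct links
    return 0;
}
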
@@ -136,7 +142,8 @@ std::string Crowler::download(std::string domain, std::string path)
         break;
     default:
         std::cout << "Unexpected HTTP status " << res.result_int() << "\n";
-        std::cout << domain + path << res.result_int() << "\n";
+        std::cout << domain << "\n";
+        std::cout << path << "\n";
         break;
     }
     return strBody;
@@ -156,8 +163,8 @@ std::vector<std::string> Crowler::getDataFromHtml(std::string s, std::regex filter)
 
     auto words_begin = std::sregex_iterator(s.begin(), s.end(), filter);
     auto words_end = std::sregex_iterator();
-    std::regex remove_prefix("<a href=");
-    std::regex remove_suffix("#.*");
+    std::regex remove_prefix("<a href=\"");
+    std::regex remove_suffix("[#\"].*");
 
     for (std::sregex_iterator i = words_begin; i != words_end; ++i) {
         std::smatch match = *i;
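
For context, the tightened patterns now strip the opening quote together with the <a href=" prefix and cut everything from the first fragment marker or closing quote, so an extracted sub-URL no longer keeps a trailing quote or #fragment. A standalone sketch, assuming the surrounding loop applies the two regexes with std::regex_replace (the diff does not show that part, and the sample anchor is made up):

#include <iostream>
#include <regex>
#include <string>

int main() {
    // One match as the href filter might capture it (hypothetical sample).
    std::string match = "<a href=\"/index.php/EVP#Examples\"";
    std::regex remove_prefix("<a href=\"");
    std::regex remove_suffix("[#\"].*");
    std::string subUrl = std::regex_replace(match, remove_prefix, "");
    subUrl = std::regex_replace(subUrl, remove_suffix, "");
    std::cout << subUrl << "\n";  // prints: /index.php/EVP
    return 0;
}
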
@@ -228,3 +235,23 @@ void Crowler::work() {
         }
     }
 }
+
+std::pair<std::string, std::string> Crowler::parseSubUrl(std::string domain, std::string subUrl) {
+
+    std::string path;
+
+    if (subUrl.find("http") != std::string::npos) {
+        std::vector<std::string> parts;
+        boost::split(parts, subUrl, boost::is_any_of("/"));
+        domain = parts[2];
+        path = parts[3];
+        for (size_t i=4; i<parts.size(); ++i) {
+            path += "/";
+            path += parts[i];
+        }
+    } else {
+        path = subUrl;
+    }
+    return std::pair(domain, path);
+
+}
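
For reference, the new parseSubUrl splits an absolute URL on '/' with boost::split, takes parts[2] as the domain and rejoins the remaining parts as the path (note the result has no leading slash); relative links keep the current domain. A standalone sketch of the same logic, fed the sample URL from the commented-out test in main.cpp below:

#include <iostream>
#include <string>
#include <utility>
#include <vector>
#include <boost/algorithm/string.hpp>

std::pair<std::string, std::string> parseSubUrl(std::string domain, std::string subUrl) {
    std::string path;
    if (subUrl.find("http") != std::string::npos) {
        // "http://google.com/index.php/Special:MyContributions" splits into
        // { "http:", "", "google.com", "index.php", "Special:MyContributions" }
        std::vector<std::string> parts;
        boost::split(parts, subUrl, boost::is_any_of("/"));
        domain = parts[2];
        path = parts[3];
        for (size_t i = 4; i < parts.size(); ++i) {
            path += "/";
            path += parts[i];
        }
    } else {
        path = subUrl;  // relative link: keep the current domain
    }
    return std::pair(domain, path);
}

int main() {
    auto test = parseSubUrl("www.ya.ru", "http://google.com/index.php/Special:MyContributions");
    std::cout << test.first << std::endl;   // google.com
    std::cout << test.second << std::endl;  // index.php/Special:MyContributions
    return 0;
}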

Diff for: Crowler.h

+2
@@ -32,6 +32,8 @@ class Crowler
     void addToCrowlingQueue(std::string domain, std::string pat, unsigned short depth);
     // method for taking the next resource-processing task from the task queue and processing it
     void work();
+    // method for splitting an internal url into domain and path
+    std::pair<std::string, std::string> parseSubUrl(std::string domain, std::string subUrl);
 
 public:
     Crowler();

Diff for: config.ini

+1, -1
@@ -7,6 +7,6 @@ password=searcher
 [Crowler]
 startPageDomain=wiki.openssl.org
 startPagePath=/index.php/EVP
-recursionDepth=1
+recursionDepth=2
 [Searcher]
 serverPort=8080

Diff for: main.cpp

+5
@@ -5,6 +5,7 @@
 #include "Searcher.h"
 
 
+
 int main()
 {
     // Testing
@@ -68,5 +69,9 @@ int main()
 //     std::cout << i << std::endl;
 // }
 
+    // std::string subUrl = "http://google.com/index.php/Special:MyContributions";
+    // std::pair<std::string, std::string> test = parseSubUrl("www.ya.ru", subUrl);
+    // std::cout << test.first << std::endl;
+    // std::cout << test.second << std::endl;
     return 0;
 }
