webcloner.py
import os
import sys

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


class CloneWebsite:
    def __init__(self, website_url):
        self.website_url = website_url
        self.domain_name = urlparse(website_url).netloc
        self.visited_urls = set()

    def get_full_url(self, path):
        # Resolve a possibly relative link against the site's base URL.
        return urljoin(self.website_url, path)

    def valid_url(self, url):
        # Only follow URLs that stay on the same domain as the start page.
        return urlparse(url).netloc == self.domain_name

    def save_content(self, url, path):
        # Download a resource and write it to disk, creating directories as needed.
        try:
            response = requests.get(url, stream=True)
            if response.status_code == 200:
                os.makedirs(os.path.dirname(path), exist_ok=True)
                with open(path, 'wb') as file:
                    for chunk in response.iter_content(chunk_size=8192):
                        file.write(chunk)
        except Exception as e:
            print(f"Error saving {url}: {e}")

    def crawl_website(self, url=None):
        if url is None:
            url = self.website_url
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)

        try:
            response = requests.get(url)
            if response.status_code != 200:
                return
        except Exception as e:
            print(f"Error accessing {url}: {e}")
            return

        soup = BeautifulSoup(response.text, 'html.parser')

        # Save the current page
        path = urlparse(url).path
        if not path.endswith('.html'):
            path = os.path.join(path, 'index.html')
        self.save_content(url, os.path.join(self.domain_name, path.lstrip('/')))

        # Extract and save all linked resources
        for tag, attribute in [('img', 'src'), ('script', 'src'), ('link', 'href'), ('a', 'href')]:
            for resource in soup.find_all(tag):
                if attribute in resource.attrs:
                    resource_url = self.get_full_url(resource[attribute])
                    if self.valid_url(resource_url):
                        file_path = os.path.join(self.domain_name, urlparse(resource_url).path.lstrip('/'))
                        if resource_url.endswith('.html'):
                            # Recurse into same-domain HTML pages.
                            self.crawl_website(resource_url)
                        else:
                            self.save_content(resource_url, file_path)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python webcloner.py <website_url>")
        sys.exit(1)
    website_url = sys.argv[1]
    clone = CloneWebsite(website_url)
    clone.crawl_website()
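
# Example invocation (a sketch; "https://example.com" is a placeholder URL,
# not taken from the original source):
#
#   python webcloner.py https://example.com
#
# Cloned pages and assets are written under a directory named after the site's
# domain (e.g. example.com/index.html), mirroring the URL paths found while
# crawling.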