href_checker.py
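
# Usage:
#   python href_checker.py              run all three checks
#   python href_checker.py --internal   check internal <a href> targets
#   python href_checker.py --external   check external links over HTTP
#   python href_checker.py --assets     check <img>/<script>/<link> asset paths
#
# Expects the built site to live under dist/ relative to the working directory.
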
import glob
import sys

import requests
from bs4 import BeautifulSoup
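

# Every generated page under dist/, excluding HTML files shipped under
# dist/assets/ (those are treated as linkable assets rather than site pages).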
def get_all_html():
    corvette_htmls = glob.glob("dist/assets/**/*.html", recursive=True)
    return [g for g in glob.glob("dist/**/*.html", recursive=True) if g not in corvette_htmls]
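

# Every file under dist/assets/; the "." test is a cheap way to skip
# bare directories that glob also returns.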
def get_all_assets():
    return [g for g in glob.glob("dist/assets/**/*", recursive=True) if "." in g]
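

# Parse a page from disk into a BeautifulSoup tree.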
def soup_up(page):
    with open(page, "r") as f:
        return BeautifulSoup(f.read(), "html.parser")
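

# True for site-relative hrefs like "/about/"; False for None or empty hrefs.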
def is_internal(h):
    return bool(h) and h.startswith("/")
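

# True for absolute http(s) URLs; everything else (mailto:, tel:, fragments)
# is ignored by the external check.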
def is_external(h):
    # Guard against <a> tags with no href attribute, where h is None.
    return bool(h) and h.startswith("http")
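

# Map an internal href to the file it should resolve to inside dist/.
# Directory-style links ("/about/" or "/about") resolve to an index.html.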
def clean_internal(h):
    # Drop query strings and fragments so the href maps to a file on disk.
    if "?" in h:
        h = h.split("?")[0]
    if "#" in h:
        h = h.split("#")[0]
    if h.endswith(".html") or h.endswith(".pdf"):
        pass  # already names a file
    elif h.endswith("/"):
        h = h + "index.html"
    else:
        h = h + "/index.html"
    return "dist" + h
def check_internal_hrefs():
    print("---Checking Internal Links---")
    pages = get_all_html()
    assets = get_all_assets()
    for page in pages:
        soup = soup_up(page)
        links = [clean_internal(h.get("href")) for h in soup.find_all("a") if is_internal(h.get("href"))]
        for link in links:
            if link not in pages and link not in assets:
                print("{} IN {}".format(link, page))
    print("---Done---")
def check_external_hrefs():
    print("---Checking External Links---")
    pages = get_all_html()
    checked = set()
    for page in pages:
        soup = soup_up(page)
        links = [h.get("href") for h in soup.find_all("a") if is_external(h.get("href")) and "mailto" not in h.get("href")]
        for link in links:
            if link not in checked:
                try:
                    # HEAD first to keep requests cheap; fall back to GET, since
                    # some servers 404 on HEAD but serve the page normally.
                    resp = requests.head(link, timeout=10)
                    if resp.status_code == 404:
                        resp = requests.get(link, timeout=10)
                        if resp.status_code == 404:  # Only 404 is flagged; other codes run into scraper-vs-browser permission differences
                            print("NOT FOUND: {} IN {}".format(link, page))
                except requests.RequestException:
                    print("ERROR DURING {} IN {}".format(link, page))
                checked.add(link)
    print("---Done---")
def check_asset_srcs():
    print("---Checking Asset Links---")
    pages = get_all_html()
    assets = get_all_assets()
    all_links = []
    for page in pages:
        soup = soup_up(page)
        images = ["dist" + s.get("src") for s in soup.find_all("img") if is_internal(s.get("src"))]
        scripts = ["dist" + s.get("src") for s in soup.find_all("script") if is_internal(s.get("src"))]
        styles = ["dist" + s.get("href") for s in soup.find_all("link") if is_internal(s.get("href"))]
        for link in images + scripts + styles:
            all_links.append(link)
            if link not in assets:
                print("{} IN {}".format(link, page))
    print("---Done---")
    print("---Checking Asset Usage---")
    for asset in assets:
        if asset not in all_links and not asset.endswith(".html") and not asset.endswith(".pdf"):
            print("UNUSED: {}".format(asset))
    print("---Done---")
if __name__ == "__main__":
    if len(sys.argv) == 2:
        if sys.argv[1] == "--internal":
            check_internal_hrefs()
        elif sys.argv[1] == "--external":
            check_external_hrefs()
        elif sys.argv[1] == "--assets":
            check_asset_srcs()
        else:
            print("Unrecognized flag")
    else:
        check_internal_hrefs()
        check_external_hrefs()
        check_asset_srcs()