Skip to content

Commit 4acf8c6

Browse files
committed
some garbage code I wrote
1 parent 779febf commit 4acf8c6

File tree

1 file changed

+119
-0
lines changed

1 file changed

+119
-0
lines changed

pdf_update_checker/main.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
from __future__ import annotations
2+
from typing import List
3+
from bs4 import BeautifulSoup
4+
5+
import requests
6+
7+
8+
class PDF_Update:
    """Track the PDF links published on a web page and keep a local list
    of those links (one URL per line in ``output_file``) in sync.

    Methods return ``self`` so calls chain fluently, e.g.
    ``PDF_Update(...).read_file().filter("SEv3")``.
    """

    def __init__(self, url: str, base_url: str, output_file: str):
        """Store configuration only; no I/O happens here.

        url: page that lists the PDFs (used by the commented-out
            live-fetch path in ``get_content_web``).
        base_url: prefix joined onto each relative PDF href.
        output_file: local text file holding one URL per line.
        """
        # BUG FIX: the original accepted ``url`` but never stored it, so
        # the live-fetch path referencing ``self.url`` would raise
        # AttributeError if re-enabled.
        self.url: str = url
        self.base_url: str = base_url
        self.web_urls: List[str] = []  # URLs scraped from the page
        self.urls: List[str] = []      # URLs loaded from output_file
        # Initialized (the original only annotated it, so the attribute
        # did not exist until get_content() ran).
        self.content: str = ""
        self.output_file: str = output_file

    def read_file(self):
        """Load ``self.urls`` from ``output_file`` (one URL per line)."""
        self.urls = self.get_content(self.output_file).split("\n")
        return self

    def url_id(self, url: str, pattern_start: str, pattern_end: str) -> str:
        """Return the text between the first ``pattern_start`` and the
        next ``pattern_end`` in ``url``.

        Raises IndexError when ``pattern_start`` does not occur in ``url``.
        """
        return url.split(pattern_start)[1].split(pattern_end)[0]

    def write(self):
        """Write ``self.urls`` to ``output_file``, one URL per line."""
        # ``with`` closes the file; the original's explicit close() call
        # inside the with-block was redundant and has been dropped.
        with open(self.output_file, "w") as file:
            for url in self.urls:
                file.write(url + "\n")
        return self

    def write_web(self):
        """Adopt the scraped URLs as the local list and persist them."""
        self.urls = self.web_urls
        return self.write()

    def get_content(self, file_name: str) -> str:
        """Read ``file_name``, cache the text in ``self.content``, and
        return it."""
        with open(file_name) as file:
            content = file.read()
        self.content = content
        return content

    def get_content_web(self) -> PDF_Update:
        """Scrape every ``.pdf`` link from the saved page source into
        ``self.web_urls`` (each prefixed with ``base_url``).

        NOTE(review): currently parses a local snapshot ('source.txt')
        rather than fetching ``self.url`` over HTTP; the live path is
        kept below for reference.
        """
        # page = requests.get(self.url)
        # if page.status_code != 200:
        #     raise Exception("Oops. URL threw non 200 code.")
        # soup = BeautifulSoup(page.text, 'html.parser')
        soup = BeautifulSoup(self.get_content('source.txt'), 'html.parser')
        # NOTE(review): an <a> without an href yields None here and, as
        # in the original, raises AttributeError on .casefold().
        self.web_urls = [
            self.base_url + link.get('href')
            for link in soup.find_all('a')
            if link.get('href').casefold().endswith('.pdf')
        ]
        return self

    def filter(self, filter: str) -> PDF_Update:
        """Keep only URLs containing ``filter`` (case-insensitive) in
        both the local and scraped lists, scraping first if the web
        list is still empty."""
        needle = filter.casefold()
        self.urls = [u for u in self.urls if needle in u.casefold()]
        if not self.web_urls:
            self.get_content_web()
        self.web_urls = [u for u in self.web_urls if needle in u.casefold()]
        return self

    def __str__(self):
        return "\n".join(self.urls)

    def prompt_yn(self, msg: str) -> bool:
        """Ask the user ``msg``; True only for an exact 'y' answer."""
        return input(msg + "? Enter y or n: ") == "y"

    def compare_urls_with_web(self, filter: str):
        """Re-scrape, filter both lists, then interactively reconcile
        the local list against the web list, writing to disk on each
        accepted change."""
        self.get_content_web()
        self.filter(filter)
        if len(self.web_urls) != len(self.urls):
            # Counts differ: offer a wholesale replacement, then stop —
            # a positional comparison would be meaningless.
            if self.prompt_yn("url count mismatch: update local"):
                self.urls = self.web_urls
                self.write()
            return self
        for i, (local, web) in enumerate(zip(self.urls, self.web_urls)):
            if local != web:
                if self.prompt_yn("url mismatch: update local"):
                    print("Web url: " + web)
                    print("Local url: " + local)
                    self.urls[i] = web
                    self.write()
        return self
105+
106+
107+
if __name__ == "__main__":
    # Page that lists the PDFs, and the prefix for its relative links.
    page_url = "https://www.cl.cam.ac.uk/~rja14/book.html"
    link_prefix = "https://www.cl.cam.ac.uk/~rja14/"

    checker = PDF_Update(page_url, link_prefix, "url.txt")
    checker.read_file().filter("SEv3")
    checker.compare_urls_with_web("SEv3")

    # print(checker)
    # print(checker.url_id(checker.urls[10], "-ch", "-"))
    # print(get_content())

0 commit comments

Comments
 (0)