Skip to content

Commit 55cd09c

Browse files
committed
style: run pre-commit
1 parent e683198 commit 55cd09c

File tree

4 files changed

+123
-108
lines changed

4 files changed

+123
-108
lines changed

otodom/task_1/mk/otodom.py

Lines changed: 121 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -1,123 +1,139 @@
1-
from bs4 import BeautifulSoup
2-
import requests
3-
import re
41
import json
2+
import re
53
import sys
64

5+
import requests
6+
from bs4 import BeautifulSoup
7+
78
# Request headers that imitate a desktop Chrome 118 browser so otodom.pl
# serves the normal HTML listing pages to the scraper.
HEADERS = {
    "Access-Control-Allow-Origin": "*",
    "Content-Type": "text/html; charset=utf-8",
    "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
    "Sec-Ch-Ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
    ),
}
1715

16+
1817
class Crawler:
19-
2018
def getPage(self, url):
    """Fetch *url* and return it parsed as a BeautifulSoup tree.

    Args:
        url: Absolute URL to download (HEADERS are sent with the request).

    Returns:
        BeautifulSoup tree of the response body, or None when the HTTP
        request fails.
    """
    try:
        req = requests.get(url, headers=HEADERS)
    except requests.exceptions.RequestException as exc:
        # BUG FIX: the previous handler printed req.status_code, but when
        # requests.get() itself raises, `req` was never assigned — the
        # handler crashed with NameError instead of reporting the error.
        print(f"Failed to retrieve the webpage: {exc}")
        return None
    return BeautifulSoup(req.text, "html.parser")
25+
2826
def getLinks(self, url):
    """Collect listing links from one search-results page.

    Args:
        url: Search-results page URL.

    Returns:
        A (promoted, organic) tuple of href lists, or None when the page
        could not be fetched.
    """
    bs = self.getPage(url)
    if bs is None:
        return None
    promoted = self._extractHrefs(bs, "search.listing.promoted")
    organic = self._extractHrefs(bs, "search.listing.organic")
    return (promoted, organic)

def _extractHrefs(self, bs, data_cy):
    # Pull the hrefs of all listing anchors inside the container identified
    # by its data-cy attribute.
    container = bs.find("div", {"data-cy": data_cy})
    if container is None:
        # BUG FIX: previously .find_all() was called on the result of
        # bs.find() directly, raising AttributeError whenever the promoted
        # or organic section was absent from the page.
        return []
    anchors = container.find_all("a", {"class": "css-1hfdwlm e1dfeild2"})
    return [a.attrs["href"] for a in anchors if "href" in a.attrs]
47+
4648
def getListing(self, pageUrl, promoted):
    """Scrape a single otodom.pl listing page into a dict.

    Args:
        pageUrl: Listing path (appended to http://www.otodom.pl).
        promoted: Whether the listing came from the promoted section.

    Returns:
        Dict with url, otodom_id, title, localization, promoted, price,
        rooms, area and estate_agency — or None when the page could not
        be fetched.
    """
    url = "http://www.otodom.pl{}".format(pageUrl)
    bs = self.getPage(url)
    if bs is None:
        return None

    listing = {"url": url}

    # The numeric listing id sits at the end of the meta description.
    id_match = re.search(
        "[0-9]+$", bs.find("meta", {"name": "description"}).get("content", "")
    )
    listing["otodom_id"] = id_match.group() if id_match else ""

    listing["title"] = bs.find("h1", {"class": "css-1wnihf5 efcnut38"}).text

    # Address text is comma-separated; with >= 4 parts it includes
    # street, district, city and province, otherwise only street and city.
    l10n = bs.find("a", {"class": "e1w8sadu0 css-1helwne exgq9l20"}).text.split(",")
    listing["localization"] = {
        "province": l10n[-1] if len(l10n) >= 4 else "",
        "city": l10n[-2] if len(l10n) >= 4 else l10n[-1],
        "district": l10n[1] if len(l10n) >= 4 else "",
        # BUG FIX: both branches of the original conditional were l10n[0],
        # so the ternary was redundant.
        "street": l10n[0],
    }

    listing["promoted"] = promoted
    listing["price"] = self._parseInt(
        bs.find("strong", {"class": "css-t3wmkv e1l1avn10"}).text
    )
    listing["rooms"] = self._parseInt(
        bs.find("div", {"data-testid": "table-value-rooms_num"}).text
    )
    listing["area"] = self._parseInt(
        bs.find("div", {"data-testid": "table-value-area"}).text
    )
    listing["estate_agency"] = bs.find(
        "div", {"data-testid": "table-value-advertiser_type"}
    ).text

    return listing

def _parseInt(self, text):
    # Normalize a localized number like "1 234,56 zł" to an int (1234):
    # decimal comma -> dot, strip everything but digits and the dot.
    normalized = text.replace(",", ".")
    return int(float(re.sub(r"[^.0-9]", "", normalized)))
8193

82-
bs = self.getPage(url)
83-
84-
if bs is not None:
85-
listings_url = set()
86-
listing_json = []
87-
88-
number_of_pages = 1
89-
if check_all_pages:
90-
page_numeration = bs.find_all('a', {'class': 'eo9qioj1 css-5tvc2l edo3iif1'})
91-
number_of_pages = max([int(num.text) for num in page_numeration])
92-
93-
for page_number in range(1, number_of_pages + 1):
94-
print(page_number)
95-
listing_links = self.getLinks(url + "?page={}".format(page_number))
96-
# listing_links = self.getLinks(url)
97-
# promoted ads
98-
for listing_url in listing_links[0]:
99-
if listing_url not in listings_url:
100-
listings_url.add(listing_url)
101-
listing_json.append(self.getListing(listing_url, promoted=True))
102-
103-
104-
# organic ads
105-
for listing_url in listing_links[1]:
106-
if listing_url not in listings_url:
107-
listings_url.add(listing_url)
108-
listing_json.append(self.getListing(listing_url, promoted=False))
109-
110-
with open('otodom_listing.json', 'w', encoding='utf-8') as json_file:
111-
json.dump(listing_json, json_file, ensure_ascii=False ,indent=2)
94+
def scrap_listings(self, url, check_all_pages=False):
    """Scrape all listings reachable from a search URL into a JSON file.

    Args:
        url: Search-results URL produced by generate_url() or given by
            the user.
        check_all_pages: When True, walk every pagination page instead of
            only the first.

    Side effects:
        Writes the collected listings to otodom_listing.json and prints
        each page number as a progress indicator.
    """
    bs = self.getPage(url)
    if bs is None:
        return

    listings_url = set()
    listing_json = []

    number_of_pages = 1
    if check_all_pages:
        page_numeration = bs.find_all("a", {"class": "eo9qioj1 css-5tvc2l edo3iif1"})
        # BUG FIX: max() over an empty sequence raised ValueError when no
        # pagination links were found; default to a single page instead.
        number_of_pages = max(
            (int(num.text) for num in page_numeration), default=1
        )

    for page_number in range(1, number_of_pages + 1):
        print(page_number)  # progress indicator
        listing_links = self.getLinks(url + "?page={}".format(page_number))
        if listing_links is None:
            # BUG FIX: getLinks() returns None when a page fails to load;
            # subscripting it crashed with TypeError.
            continue
        # Index 0 holds promoted ads, index 1 organic ads.
        for promoted, links in ((True, listing_links[0]), (False, listing_links[1])):
            for listing_url in links:
                if listing_url in listings_url:
                    continue
                listings_url.add(listing_url)
                listing = self.getListing(listing_url, promoted=promoted)
                # BUG FIX: getListing() may return None on a failed fetch;
                # don't write null entries into the JSON output.
                if listing is not None:
                    listing_json.append(listing)

    with open("otodom_listing.json", "w", encoding="utf-8") as json_file:
        json.dump(listing_json, json_file, ensure_ascii=False, indent=2)
112128

113129
def generate_url(self):
114-
with open('otodom_settings.json') as f:
130+
with open("otodom_settings.json") as f:
115131
data = json.load(f)
116132
url = data["base_url"] + "pl/wyniki"
117-
133+
118134
if data["only_for_sale"]:
119135
url += "/sprzedaz"
120-
136+
121137
if data["only_for_rent"]:
122138
url += "/wynajem"
123139
url += "/" + data["property_type"] + "/"
@@ -130,21 +146,20 @@ def generate_url(self):
130146

131147
if len(data["price_min"]) > 0:
132148
url += "&priceMin=" + data["price_min"]
133-
149+
134150
if len(data["price_max"]) > 0:
135151
url += "&priceMax=" + data["price_max"]
136152

137153
url += "&by=LATEST&direction=DESC&viewType=listing"
138154
# print("Generated link:\n", url)
139155
return url
140-
141-
if __name__ == '__main__':
156+
157+
158+
if __name__ == "__main__":
    crawler = Crawler()

    # "-u <url>" on the command line scrapes that URL directly; otherwise
    # the search URL is built from otodom_settings.json by generate_url().
    if len(sys.argv) > 2 and sys.argv[1] == "-u":
        print(sys.argv[2])
        target_url = sys.argv[2]
    else:
        target_url = crawler.generate_url()
    crawler.scrap_listings(target_url, check_all_pages=False)
149-
150-

otodom/task_1/mk/otodom_listing.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,4 +543,4 @@
543543
"area": 55,
544544
"estate_agency": "prywatny"
545545
}
546-
]
546+
]

otodom/task_1/mk/otodom_settings.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
"property_type": "mieszkanie",
88
"only_for_sale": false,
99
"only_for_rent": true
10-
}
10+
}

otodom/task_1/mk/requirements.txt

1 Byte
Binary file not shown.

0 commit comments

Comments
 (0)