- from bs4 import BeautifulSoup
- import requests
- import re
import json
+ import re
import sys

+ import requests
+ from bs4 import BeautifulSoup
+
HEADERS = {
-     "Access-Control-Allow-Origin": "*",
-     "Content-Encoding": "gzip",
-     "Content-Type": "text/html; charset=utf-8",
-     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-     "Accept-Encoding": "gzip, deflate, br",
-     "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
-     "Sec-Ch-Ua": "\"Chromium\";v=\"118\", \"Google Chrome\";v=\"118\", \"Not=A?Brand\";v=\"99\"",
-     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
+     "Access-Control-Allow-Origin": "*",
+     "Content-Type": "text/html; charset=utf-8",
+     "Accept-Language": "pl-PL,pl;q=0.9,en-US;q=0.8,en;q=0.7",
+     "Sec-Ch-Ua": '"Chromium";v="118", "Google Chrome";v="118", "Not=A?Brand";v="99"',
+     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",  # noqa: E501
}

+

class Crawler:
-
    def getPage(self, url):
        try:
-             req = requests.get(url, headers=HEADERS)
+             req = requests.get(url, headers=HEADERS)
        except requests.exceptions.RequestException:
-             print(f'Failed to retrieve the webpage. Status code: {req.status_code}')
-             return None
-         return BeautifulSoup(req.text, "html.parser")
-
+             # A RequestException means no response object exists, so req.status_code
+             # is not available here; report a plain failure message instead.
+             print("Failed to retrieve the webpage.")
+             return None
+         return BeautifulSoup(req.text, "html.parser")
+
    def getLinks(self, url):
-         bs = self.getPage(url)
-         if bs is not None:
-             promoted_listings = bs.find('div', {'data-cy': 'search.listing.promoted'}).find_all('a', {'class': 'css-1hfdwlm e1dfeild2'})
-             promoted = []
-             for link in promoted_listings:
-                 if 'href' in link.attrs:
-                     promoted.append(link.attrs['href'])
-
-             organic_listings = bs.find('div', {'data-cy': 'search.listing.organic'}).find_all('a', {'class': 'css-1hfdwlm e1dfeild2'})
-             organic = []
-             for link in organic_listings:
-                 if 'href' in link.attrs:
-                     organic.append(link.attrs['href'])
-
-             return (promoted, organic)
-         return None
-
+         bs = self.getPage(url)
+         if bs is not None:
+             promoted_listings = bs.find(
+                 "div", {"data-cy": "search.listing.promoted"}
+             ).find_all("a", {"class": "css-1hfdwlm e1dfeild2"})
+             promoted = []
+             for link in promoted_listings:
+                 if "href" in link.attrs:
+                     promoted.append(link.attrs["href"])
+
+             organic_listings = bs.find(
+                 "div", {"data-cy": "search.listing.organic"}
+             ).find_all("a", {"class": "css-1hfdwlm e1dfeild2"})
+             organic = []
+             for link in organic_listings:
+                 if "href" in link.attrs:
+                     organic.append(link.attrs["href"])
+
+             return (promoted, organic)
+         return None
+
    def getListing(self, pageUrl, promoted):
-         url = 'http://www.otodom.pl{}'.format(pageUrl)
-         bs = self.getPage(url)
-         if bs is not None:
-             listing = dict()
-             listing["url"] = url
-
-             id_match = re.search('[0-9]+$', bs.find('meta', {'name': 'description'}).get('content', ''))
-             listing["otodom_id"] = id_match.group() if id_match else ""
-
-             listing["title"] = bs.find('h1', {'class': 'css-1wnihf5 efcnut38'}).text
-
-             localization = dict()
-             l10n = bs.find('a', {'class': 'e1w8sadu0 css-1helwne exgq9l20'}).text.split(',')
-
-             localization["province"] = l10n[-1] if len(l10n) >= 4 else ""
-             localization["city"] = l10n[-2] if len(l10n) >= 4 else l10n[-1]
-             localization["district"] = l10n[1] if len(l10n) >= 4 else ""
-             localization["street"] = l10n[0] if len(l10n) >= 4 else l10n[0]
-
-             listing["localization"] = localization
-
-             listing["promoted"] = promoted
-             number = bs.find('strong', {'class': 'css-t3wmkv e1l1avn10'}).text.replace(',', '.')
-             listing["price"] = int(float(re.sub(r'[^.0-9]', '', number)))
-             number = bs.find('div', {'data-testid': 'table-value-rooms_num'}).text.replace(',', '.')
-             listing["rooms"] = int(float(re.sub(r'[^.0-9]', '', number)))
-             number = bs.find('div', {'data-testid': 'table-value-area'}).text.replace(',', '.')
-             listing["area"] = int(float(re.sub(r'[^.0-9]', '', number)))
-             listing["estate_agency"] = bs.find('div', {'data-testid': 'table-value-advertiser_type'}).text
-
-             return listing
-         return None
-
-     def scrap_listings(self, url, check_all_pages=False):
+         url = "http://www.otodom.pl{}".format(pageUrl)
+         bs = self.getPage(url)
+         if bs is not None:
+             listing = dict()
+             listing["url"] = url
+
+             id_match = re.search(
+                 "[0-9]+$", bs.find("meta", {"name": "description"}).get("content", "")
+             )
+             listing["otodom_id"] = id_match.group() if id_match else ""
+
+             listing["title"] = bs.find("h1", {"class": "css-1wnihf5 efcnut38"}).text
+
+             localization = dict()
+             l10n = bs.find("a", {"class": "e1w8sadu0 css-1helwne exgq9l20"}).text.split(
+                 ","
+             )
+
+             localization["province"] = l10n[-1] if len(l10n) >= 4 else ""
+             localization["city"] = l10n[-2] if len(l10n) >= 4 else l10n[-1]
+             localization["district"] = l10n[1] if len(l10n) >= 4 else ""
+             localization["street"] = l10n[0] if len(l10n) >= 4 else l10n[0]
+
+             listing["localization"] = localization
+
+             listing["promoted"] = promoted
+             number = bs.find("strong", {"class": "css-t3wmkv e1l1avn10"}).text.replace(
+                 ",", "."
+             )
+             listing["price"] = int(float(re.sub(r"[^.0-9]", "", number)))
+             number = bs.find(
+                 "div", {"data-testid": "table-value-rooms_num"}
+             ).text.replace(",", ".")
+             listing["rooms"] = int(float(re.sub(r"[^.0-9]", "", number)))
+             number = bs.find("div", {"data-testid": "table-value-area"}).text.replace(
+                 ",", "."
+             )
+             listing["area"] = int(float(re.sub(r"[^.0-9]", "", number)))
+             listing["estate_agency"] = bs.find(
+                 "div", {"data-testid": "table-value-advertiser_type"}
+             ).text
+
+             return listing
+         return None
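
For orientation, getListing() returns one dict per ad, and scrap_listings() later dumps the collected dicts to otodom_listing.json. A sketch of a single record, with the field names taken from the code above and every value invented purely for illustration:

    {
        "url": "http://www.otodom.pl/pl/oferta/...",
        "otodom_id": "12345678",
        "title": "Mieszkanie 3-pokojowe",
        "localization": {
            "province": "mazowieckie",
            "city": "Warszawa",
            "district": "Mokotów",
            "street": "Przykładowa",
        },
        "promoted": False,
        "price": 650000,
        "rooms": 3,
        "area": 54,
        "estate_agency": "biuro nieruchomości",
    }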

-         bs = self.getPage(url)
-
-         if bs is not None:
-             listings_url = set()
-             listing_json = []
-
-             number_of_pages = 1
-             if check_all_pages:
-                 page_numeration = bs.find_all('a', {'class': 'eo9qioj1 css-5tvc2l edo3iif1'})
-                 number_of_pages = max([int(num.text) for num in page_numeration])
-
-             for page_number in range(1, number_of_pages + 1):
-                 print(page_number)
-                 listing_links = self.getLinks(url + "?page={}".format(page_number))
-                 # listing_links = self.getLinks(url)
-                 # promoted ads
-                 for listing_url in listing_links[0]:
-                     if listing_url not in listings_url:
-                         listings_url.add(listing_url)
-                         listing_json.append(self.getListing(listing_url, promoted=True))
-
-
-                 # organic ads
-                 for listing_url in listing_links[1]:
-                     if listing_url not in listings_url:
-                         listings_url.add(listing_url)
-                         listing_json.append(self.getListing(listing_url, promoted=False))
-
-             with open('otodom_listing.json', 'w', encoding='utf-8') as json_file:
-                 json.dump(listing_json, json_file, ensure_ascii=False,indent=2)
+     def scrap_listings(self, url, check_all_pages=False):
+         bs = self.getPage(url)
+
+         if bs is not None:
+             listings_url = set()
+             listing_json = []
+
+             number_of_pages = 1
+             if check_all_pages:
+                 page_numeration = bs.find_all(
+                     "a", {"class": "eo9qioj1 css-5tvc2l edo3iif1"}
+                 )
+                 number_of_pages = max([int(num.text) for num in page_numeration])
+
+             for page_number in range(1, number_of_pages + 1):
+                 print(page_number)
+                 listing_links = self.getLinks(url + "?page={}".format(page_number))
+                 # listing_links = self.getLinks(url)
+                 # promoted ads
+                 for listing_url in listing_links[0]:
+                     if listing_url not in listings_url:
+                         listings_url.add(listing_url)
+                         listing_json.append(self.getListing(listing_url, promoted=True))
+
+                 # organic ads
+                 for listing_url in listing_links[1]:
+                     if listing_url not in listings_url:
+                         listings_url.add(listing_url)
+                         listing_json.append(
+                             self.getListing(listing_url, promoted=False)
+                         )
+
+             with open("otodom_listing.json", "w", encoding="utf-8") as json_file:
+                 json.dump(listing_json, json_file, ensure_ascii=False, indent=2)

    def generate_url(self):
-         with open('otodom_settings.json') as f:
+         with open("otodom_settings.json") as f:
            data = json.load(f)
        url = data["base_url"] + "pl/wyniki"
-
+
        if data["only_for_sale"]:
            url += "/sprzedaz"
-
+
        if data["only_for_rent"]:
            url += "/wynajem"
        url += "/" + data["property_type"] + "/"
@@ -130,21 +146,20 @@ def generate_url(self):

        if len(data["price_min"]) > 0:
            url += "&priceMin=" + data["price_min"]
-
+
        if len(data["price_max"]) > 0:
            url += "&priceMax=" + data["price_max"]

        url += "&by=LATEST&direction=DESC&viewType=listing"
        # print("Generated link:\n", url)
        return url
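
generate_url() assembles the search URL from an otodom_settings.json file placed next to the script. A minimal sketch of that file, limited to the keys visible in this diff (the elided hunk above most likely reads further filter keys); the values are assumptions, not taken from the repository:

    {
        "base_url": "https://www.otodom.pl/",
        "only_for_sale": true,
        "only_for_rent": false,
        "property_type": "mieszkanie",
        "price_min": "400000",
        "price_max": "700000"
    }

Note that price_min and price_max stay strings, which matches the len() checks and string concatenation above.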
-
- if __name__ == '__main__':
+
+
+ if __name__ == "__main__":
    crawler = Crawler()

    if len(sys.argv) > 2 and sys.argv[1] == "-u":
        print(sys.argv[2])
        crawler.scrap_listings(sys.argv[2], check_all_pages=False)
    else:
        crawler.scrap_listings(crawler.generate_url(), check_all_pages=False)
-
-
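
Usage is unchanged by this diff: running the script with -u followed by a full otodom results URL scrapes that results page directly, while running it with no arguments lets generate_url() build the URL from otodom_settings.json. In both branches check_all_pages is passed as False, so only the first results page is crawled, and the records are written to otodom_listing.json.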