Commit b0064e8, parent 0b51fcd. Showing 11 changed files with 349 additions and 60 deletions.
crawler/__init__.py
@@ -0,0 +1 @@
from crawler.crawler import Crawler  # noqa: F401
crawler/crawler.py
@@ -0,0 +1,148 @@
import concurrent.futures
import json
import logging

import requests
from bs4 import BeautifulSoup, ResultSet
from crawler.utils import remove_duplicated_listings
from listing import Listing
from settings import Settings

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"  # noqa: E501
}


class Crawler:
    """
    A crawler for the otodom.pl website.
    The crawler is responsible for crawling the website and extracting the data.
    """

    def __init__(self):
        """
        Initialize the crawler and load its settings.
        """
        self.settings = Settings()
        self.params = self.generate_params()
        self.listings = []

    def generate_search_url(self) -> str:
        """
        Generate the URL to crawl.
        :return: The URL to crawl
        """
        url = self.settings.base_url

        url += "/pl/wyniki/"
        url += self.settings.auction_type.value + "/"
        url += self.settings.property_type.value + "/"
        url += self.settings.province + "/"
        url += self.settings.city + "/"
        if self.settings.district is not None:
            # District searches repeat the city segment, mirroring how
            # otodom nests district results under county and municipality
            url += self.settings.city + "/"
            url += self.settings.city + "/"
            url += self.settings.district + "/"

        return url

    def generate_params(self) -> dict:
        """
        Generate the parameters for the URL.
        :return: The parameters for the URL
        """
        return {
            "priceMin": self.settings.price_min,
            "priceMax": self.settings.price_max,
        }

    def count_pages(self) -> int:
        """
        Count the number of pages to crawl.
        :return: The number of pages to crawl
        """
        response = requests.get(
            url=self.generate_search_url(), params=self.params, headers=HEADERS
        )
        soup = BeautifulSoup(response.content, "html.parser")
        pages_element = soup.select("button[aria-current][data-cy]")
        # select() returns a list (possibly empty), never None, so test
        # for emptiness; otherwise an empty result crashes on [-1] below
        if not pages_element:
            logging.warning("No listings found with given parameters. Exiting...")
            raise SystemExit(1)
        pages = pages_element[-1].text
        return int(pages)

    def extract_listings_from_page(self, page: int) -> ResultSet:
        """
        Crawl the given page.
        :param page: The page number to crawl
        :return: The listings on the page
        """
        params = self.params.copy()
        params["page"] = page
        response = requests.get(
            url=self.generate_search_url(), params=params, headers=HEADERS
        )
        soup = BeautifulSoup(response.content, "html.parser")
        listings = soup.select("li[data-cy=listing-item]")
        return listings

    def extract_listing_data(self, listing: Listing) -> Listing:
        """
        Extract the data from the given listing.
        :param listing: The listing
        :return: The data from the listing
        """
        response = requests.get(url=listing.link, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")
        listing.extract_data_from_page(soup)
        return listing

    def get_listings(self) -> list:
        """
        Get the listings.
        :return: The listings
        """
        return self.listings

    def save_to_file(self, filename: str) -> None:
        """
        Save the listings to a file.
        :param filename: The name of the file
        """
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(
                [obj.__dict__ for obj in self.listings],
                file,
                ensure_ascii=False,
                indent=4,
            )

    def start(self) -> None:
        """
        Start the crawler.
        The crawler starts crawling the website and extracting the data.
        """
        pages = self.count_pages()
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            listings = list(
                executor.map(self.extract_listings_from_page, range(1, pages + 1))
            )

        # Flatten the per-page results, drop entries that share a link,
        # and wrap each surviving HTML element in a Listing
        listings = remove_duplicated_listings(listings)
        listings = {Listing(listing) for listing in listings}
        # Fetch every listing page concurrently and extract its details
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            listings = list(executor.map(self.extract_listing_data, listings))

        self.listings = listings
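A minimal usage sketch (an editorial aside, not part of the commit); it assumes the settings package is configured and that otodom.pl is reachable:

from crawler import Crawler

crawler = Crawler()
print(crawler.generate_search_url())  # inspect the search URL before crawling
print(crawler.count_pages())          # how many result pages match the filters
crawler.start()                       # crawl every page and extract the details
print(len(crawler.get_listings()))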
crawler/utils.py
@@ -0,0 +1,20 @@
from listing import Listing


def remove_duplicated_listings(listings: list) -> set:
    """
    Remove duplicated listings.
    :param listings: A list of per-page listing collections
    :return: The listings without duplicates
    """
    # Flatten the per-page collections, then keep only the first
    # occurrence of every listing link
    flattened_set = {value for sublist in listings for value in sublist}
    links = set()
    filtered_set = set()
    for listing in flattened_set:
        link = Listing.extract_link(listing)
        if link not in links:
            filtered_set.add(listing)
            links.add(link)

    return filtered_set
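The deduplication can be sanity-checked with synthetic markup (illustrative only; it mimics the selectors the crawler uses, not otodom's real page structure):

from bs4 import BeautifulSoup

from crawler.utils import remove_duplicated_listings

page_1 = BeautifulSoup(
    '<li data-cy="listing-item"><a href="/pl/oferta/a"></a></li>'
    '<li data-cy="listing-item"><a href="/pl/oferta/b"></a></li>',
    "html.parser",
).select("li")
page_2 = BeautifulSoup(
    '<li data-cy="listing-item"><a href="/pl/oferta/b"></a></li>',
    "html.parser",
).select("li")

# The pages share the "/pl/oferta/b" link, so one duplicate is dropped
print(len(remove_duplicated_listings([page_1, page_2])))  # 2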
listing/__init__.py
@@ -0,0 +1 @@
from listing.listing import Listing  # noqa: F401
listing/listing.py
@@ -0,0 +1,136 @@
import json

from bs4 import BeautifulSoup, Tag
from settings.s_types import Defaults


class Listing:
    """
    A class that represents a listing on the otodom.pl website.
    """

    def __init__(self, code: Tag):
        self.link = Defaults.DEFAULT_URL + self.extract_link(code)
        self.promoted = self.extract_promoted(code)
        self.province = ""
        self.city = ""
        self.district = ""
        self.street = ""
        self.otodom_id = ""
        self.title = ""
        self.price = 0
        self.price_for_m2 = 0
        self.offered_by = ""
        self.estate_agency_name = ""
        self.estate_agency_street = ""
        self.estate_agency_city = ""
        self.estate_agency_postal_code = ""
        self.estate_agency_county = ""
        self.estate_agency_province = ""

    def __repr__(self) -> str:
        return repr(self.__dict__)

    @staticmethod
    def extract_link(code: Tag) -> str:
        """
        Extracts the link from the HTML code.
        :param code: The HTML element containing the link
        :return: The extracted link
        """
        return code.select_one("a")["href"]

    @staticmethod
    def extract_promoted(code: Tag) -> bool:
        """
        Determines whether the listing is promoted.
        :param code: The HTML element containing the promotion status
        :return: True if the listing is promoted, False otherwise
        """
        return code.select_one("article>span+div") is not None

    @staticmethod
    def extract_localization(properties: dict) -> tuple:
        """
        Extracts the localization details from the properties.
        :param properties: The properties containing the localization details
        :return: A tuple containing the province, city, district, and street
        """
        province = properties["ad"]["location"]["address"]["province"]["code"]
        city = properties["ad"]["location"]["address"]["city"]["code"]
        district = properties["ad"]["location"]["address"].get("district", "")
        if isinstance(district, dict):
            district = district["name"]
        street = properties["ad"]["location"]["address"].get("street", "")
        if isinstance(street, dict):
            street = street["name"]
        return province, city, district, street

    @staticmethod
    def extract_offered_by(properties: dict) -> str:
        """
        Determines the offer type from the properties.
        :param properties: The properties containing the offer type
        :return: The offer type
        """
        return "private" if properties["ad"]["agency"] is None else "estate_agency"

    @staticmethod
    def extract_estate_agency_name(properties: dict) -> str:
        """
        Extracts the name of the estate agency from the properties.
        :param properties: The properties containing the estate agency name
        :return: The name of the estate agency
        """
        return properties["ad"]["agency"]["name"]

    @staticmethod
    def extract_estate_agency_details(properties: dict) -> tuple:
        """
        Extracts the details of the estate agency from the properties.
        :param properties: The properties containing the estate agency details
        :return: The street, postal code, city, county, and province of the agency
        """
        address = properties["ad"]["agency"]["address"].strip().split(", ")
        # Addresses with extra leading segments are trimmed before unpacking
        if len(address) > 5:
            address = address[2:]
        return address[0], address[1], address[2], "".join(address[3:-1]), address[-1]

    def extract_data_from_page(self, code: BeautifulSoup) -> None:
        """
        Extracts data from the page and updates the Listing instance.
        This method loads the listing information from a script tag in the HTML code,
        parses it as JSON, and uses it to update the attributes of the Listing instance.
        :param code: The parsed HTML page containing the listing information
        """
        listing_information = json.loads(
            code.find("script", {"type": "application/json"}).text
        )
        listing_properties = listing_information["props"]["pageProps"]
        self.otodom_id = listing_properties["ad"]["id"]
        self.title = listing_properties["ad"]["title"]
        (
            self.province,
            self.city,
            self.district,
            self.street,
        ) = self.extract_localization(listing_properties)
        self.price = listing_properties["ad"]["target"].get("Price", 0)
        self.price_for_m2 = listing_properties["ad"]["target"].get("Price_per_m", 0)
        self.offered_by = self.extract_offered_by(listing_properties)
        if self.offered_by == "estate_agency":
            (
                self.estate_agency_street,
                self.estate_agency_postal_code,
                self.estate_agency_city,
                self.estate_agency_county,
                self.estate_agency_province,
            ) = self.extract_estate_agency_details(listing_properties)
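The static helpers can likewise be exercised on a synthetic snippet (illustrative markup that only mirrors the CSS selectors the class queries, not otodom's real pages):

from bs4 import BeautifulSoup

from listing import Listing

html = (
    '<li data-cy="listing-item">'
    '<article><span></span><div></div></article>'
    '<a href="/pl/oferta/przykladowe-mieszkanie"></a>'
    '</li>'
)
item = BeautifulSoup(html, "html.parser").select_one("li")
print(Listing.extract_link(item))      # /pl/oferta/przykladowe-mieszkanie
print(Listing.extract_promoted(item))  # True, since article>span+div matches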
main.py
@@ -0,0 +1,6 @@
from crawler import Crawler

if __name__ == "__main__":
    crawler = Crawler()
    crawler.start()
    crawler.save_to_file("listings.json")
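Because save_to_file serializes each Listing's __dict__, listings.json is a JSON array whose records carry exactly the attributes set in Listing.__init__. A minimal sketch of reading the results back after a run:

import json

with open("listings.json", encoding="utf-8") as file:
    listings = json.load(file)

print(len(listings), "listings crawled")
print(sorted(listings[0]))  # attribute names from Listing.__init__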
settings/__init__.py
@@ -1 +1 @@
-from settings import Settings  # noqa: F401
+from settings.settings import Settings  # noqa: F401
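The settings package itself is not in this diff. Judging from the call sites in crawler.py and listing.py, it exposes roughly the interface sketched below; this is a hypothetical reconstruction, where the class and attribute names come from the call sites but the enum members and every concrete value are placeholders.

# Hypothetical sketch of settings/settings.py and settings/s_types.py,
# reconstructed from attribute usage elsewhere in the repo; not actual code.
from enum import Enum


class AuctionType(Enum):
    SALE = "sprzedaz"  # assumed slug; the crawler only requires a .value


class PropertyType(Enum):
    FLAT = "mieszkanie"  # assumed slug


class Defaults:
    DEFAULT_URL = "https://www.otodom.pl"  # used to absolutize listing links


class Settings:
    def __init__(self):
        self.base_url = Defaults.DEFAULT_URL
        self.auction_type = AuctionType.SALE
        self.property_type = PropertyType.FLAT
        self.province = "mazowieckie"  # placeholder
        self.city = "warszawa"         # placeholder
        self.district = None           # optional; None skips the district URL part
        self.price_min = 0
        self.price_max = 1_000_000     # placeholder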