From b0064e828b3f70c4fed23ffba8832b6c77f2373d Mon Sep 17 00:00:00 2001
From: TheRealSeber
Date: Fri, 29 Dec 2023 12:43:20 +0100
Subject: [PATCH] feat: scraper for listings

---
 otodom/task_2/sebastian_rydz/settings.json    |  12 +-
 .../sebastian_rydz/src/crawler/__init__.py    |   1 +
 .../sebastian_rydz/src/crawler/crawler.py     | 148 ++++++++++++++++++
 .../sebastian_rydz/src/crawler/utils.py       |  20 +++
 .../sebastian_rydz/src/listing/__init__.py    |   1 +
 .../sebastian_rydz/src/listing/listing.py     | 139 ++++++++++++++++
 otodom/task_2/sebastian_rydz/src/main.py      |   6 +
 .../sebastian_rydz/src/settings/__init__.py   |   2 +-
 .../sebastian_rydz/src/settings/s_types.py    |  38 ++---
 .../sebastian_rydz/src/settings/settings.py   |  41 ++---
 .../sebastian_rydz/src/settings/utils.py      |   4 +-
 11 files changed, 352 insertions(+), 60 deletions(-)
 create mode 100644 otodom/task_2/sebastian_rydz/src/crawler/__init__.py
 create mode 100644 otodom/task_2/sebastian_rydz/src/crawler/crawler.py
 create mode 100644 otodom/task_2/sebastian_rydz/src/crawler/utils.py
 create mode 100644 otodom/task_2/sebastian_rydz/src/listing/__init__.py
 create mode 100644 otodom/task_2/sebastian_rydz/src/listing/listing.py

diff --git a/otodom/task_2/sebastian_rydz/settings.json b/otodom/task_2/sebastian_rydz/settings.json
index 913c350..c006c6f 100644
--- a/otodom/task_2/sebastian_rydz/settings.json
+++ b/otodom/task_2/sebastian_rydz/settings.json
@@ -1,13 +1,13 @@
 {
-  "base_url": "https://www.otodom.pl",
   "price": {
-    "min": 1000,
-    "max": 5000
+    "min": 0,
+    "max": 10000000
   },
-  "province" : "kujawsko-pomorskie",
-  "city": "torun",
+  "district": "",
+  "province" : "warmińsko-mazurskie",
+  "city": "elblag",
   "property_type": "flat",
-  "sale_or_rent": "rent",
+  "auction_type": "sale",
   "_comments": {
     "property_type": "Can be: 'flat', 'studio', 'house', 'investment', 'room', 'plot', 'venue', 'magazine', 'garage'",
     "sale_or_rent": "Can be: 'sale', 'rent'"
diff --git a/otodom/task_2/sebastian_rydz/src/crawler/__init__.py b/otodom/task_2/sebastian_rydz/src/crawler/__init__.py
new file mode 100644
index 0000000..919212a
--- /dev/null
+++ b/otodom/task_2/sebastian_rydz/src/crawler/__init__.py
@@ -0,0 +1 @@
+from crawler.crawler import Crawler  # noqa: F401
diff --git a/otodom/task_2/sebastian_rydz/src/crawler/crawler.py b/otodom/task_2/sebastian_rydz/src/crawler/crawler.py
new file mode 100644
index 0000000..910f907
--- /dev/null
+++ b/otodom/task_2/sebastian_rydz/src/crawler/crawler.py
@@ -0,0 +1,148 @@
+import concurrent.futures
+import json
+import logging
+
+import requests
+from bs4 import BeautifulSoup
+from crawler.utils import remove_duplicated_listings
+from listing import Listing
+from settings import Settings
+
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"  # noqa: E501
+}
+
+
+class Crawler:
+    """
+    A crawler for the otodom.pl website.
+
+    The crawler is responsible for crawling the website and extracting the data.
+    """
+
+    def __init__(self):
+        """
+        Initialize the crawler.
+
+        Loads the settings and prepares the base query parameters.
+        """
+        self.settings = Settings()
+        self.params = self.generate_params()
+        self.listings = list()
+
+    def generate_search_url(self) -> str:
+        """
+        Generate the URL to crawl.
+
+        :return: The URL to crawl
+        """
+        url = self.settings.base_url
+
+        url += "/pl/wyniki/"
+        url += self.settings.auction_type.value + "/"
+        url += self.settings.property_type.value + "/"
+        url += self.settings.province + "/"
+        url += self.settings.city + "/"
+        if self.settings.district is not None:
+            url += self.settings.city + "/"
+            url += self.settings.city + "/"
+            url += self.settings.district + "/"
+
+        return url
+
+    def generate_params(self) -> dict:
+        """
+        Generate the parameters for the URL.
+
+        :return: The parameters for the URL
+        """
+        return {
+            "priceMin": self.settings.price_min,
+            "priceMax": self.settings.price_max,
+        }
+
+    def count_pages(self) -> int:
+        """
+        Count the number of pages to crawl.
+
+        :return: The number of pages to crawl
+        """
+        response = requests.get(
+            url=self.generate_search_url(), params=self.params, headers=HEADERS
+        )
+        soup = BeautifulSoup(response.content, "html.parser")
+        pages_element = soup.select("button[aria-current][data-cy]")
+        if not pages_element:  # select() returns a list, never None
+            logging.warning("No listings found with given parameters. Exiting...")
+            raise SystemExit(1)
+        pages = pages_element[-1].text
+        return int(pages)
+
+    def extract_listings_from_page(self, page: int) -> list:
+        """
+        Crawl the given page.
+
+        :param page: The page number to crawl
+        :return: The listings on the page
+        """
+        params = self.params.copy()
+        params["page"] = page
+        response = requests.get(
+            url=self.generate_search_url(), params=params, headers=HEADERS
+        )
+        soup = BeautifulSoup(response.content, "html.parser")
+        listings = soup.select("li[data-cy=listing-item]")
+        return listings
+
+    def extract_listing_data(self, listing: Listing) -> Listing:
+        """
+        Extract the data from the given listing.
+
+        :param listing: The listing
+        :return: The same listing, updated with the data from its page
+        """
+        response = requests.get(url=listing.link, headers=HEADERS)
+        soup = BeautifulSoup(response.content, "html.parser")
+        listing.extract_data_from_page(soup)
+        return listing
+
+    def get_listings(self) -> list:
+        """
+        Get the listings.
+
+        :return: The listings
+        """
+        return self.listings
+
+    def save_to_file(self, filename: str) -> None:
+        """
+        Save the listings to a file.
+
+        :param filename: The name of the file
+        """
+        with open(filename, "w", encoding="utf-8") as file:
+            json.dump(
+                [obj.__dict__ for obj in self.listings],
+                file,
+                ensure_ascii=False,
+                indent=4,
+            )
+
+    def start(self) -> None:
+        """
+        Start the crawler.
+
+        The crawler starts crawling the website and extracting the data.
+        """
+        pages = self.count_pages()
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            listings = list(
+                executor.map(self.extract_listings_from_page, range(1, pages + 1))
+            )
+
+        listings = remove_duplicated_listings(listings)
+        listings = {Listing(listing) for listing in listings}
+        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+            listings = list(executor.map(self.extract_listing_data, listings))
+
+        self.listings = listings
diff --git a/otodom/task_2/sebastian_rydz/src/crawler/utils.py b/otodom/task_2/sebastian_rydz/src/crawler/utils.py
new file mode 100644
index 0000000..1ab4109
--- /dev/null
+++ b/otodom/task_2/sebastian_rydz/src/crawler/utils.py
@@ -0,0 +1,20 @@
+from listing import Listing
+
+
+def remove_duplicated_listings(listings: list) -> set:
+    """
+    Remove duplicated listings.
+
+    :param listings: The listings
+    :return: The listings without duplicates
+    """
+    flattened_set = {value for sublist in listings for value in sublist}
+    links = set()
+    filtered_set = set()
+    for listing in flattened_set:
+        link = Listing.extract_link(listing)
+        if link not in links:
+            filtered_set.add(listing)
+            links.add(link)
+
+    return filtered_set
diff --git a/otodom/task_2/sebastian_rydz/src/listing/__init__.py b/otodom/task_2/sebastian_rydz/src/listing/__init__.py
new file mode 100644
index 0000000..3d2d76c
--- /dev/null
+++ b/otodom/task_2/sebastian_rydz/src/listing/__init__.py
@@ -0,0 +1 @@
+from listing.listing import Listing  # noqa: F401
diff --git a/otodom/task_2/sebastian_rydz/src/listing/listing.py b/otodom/task_2/sebastian_rydz/src/listing/listing.py
new file mode 100644
index 0000000..e622610
--- /dev/null
+++ b/otodom/task_2/sebastian_rydz/src/listing/listing.py
@@ -0,0 +1,139 @@
+import json
+
+from bs4 import ResultSet
+from settings.s_types import Defaults
+
+
+class Listing:
+    """
+    A class that represents a listing on the otodom.pl website.
+    """
+
+    def __init__(self, code: ResultSet):
+        self.link = Defaults.DEFAULT_URL + self.extract_link(code)
+        self.promoted = self.extract_promoted(code)
+        self.province = ""
+        self.city = ""
+        self.district = ""
+        self.street = ""
+        self.otodom_id = ""
+        self.title = ""
+        self.price = 0
+        self.price_for_m2 = 0
+        self.offered_by = ""
+        self.estate_agency_name = ""
+        self.estate_agency_street = ""
+        self.estate_agency_city = ""
+        self.estate_agency_postal_code = ""
+        self.estate_agency_county = ""
+        self.estate_agency_province = ""
+
+    def __repr__(self) -> str:
+        return self.__dict__.__repr__()
+
+    @staticmethod
+    def extract_link(code: ResultSet) -> str:
+        """
+        Extracts the link from the HTML code.
+
+        :param code: The HTML code containing the link
+        :return: The extracted link
+        """
+        return code.select_one("a")["href"]
+
+    @staticmethod
+    def extract_promoted(code: ResultSet) -> bool:
+        """
+        Determines whether the listing is promoted.
+
+        :param code: The HTML code containing the promotion status
+        :return: True if the listing is promoted, False otherwise
+        """
+        return code.select_one("article>span+div") is not None
+
+    @staticmethod
+    def extract_localization(properties: dict) -> (str, str, str, str):
+        """
+        Extracts the localization details from the properties.
+
+        :param properties: The properties containing the localization details
+        :return: A tuple containing the province, city, district, and street
+        """
+        province = properties["ad"]["location"]["address"]["province"]["code"]
+        city = properties["ad"]["location"]["address"]["city"]["code"]
+        district = properties["ad"]["location"]["address"].get("district", "")
+        if isinstance(district, dict):
+            district = district["name"]
+        street = properties["ad"]["location"]["address"].get("street", "")
+        if isinstance(street, dict):
+            street = street["name"]
+        return province, city, district, street
+
+    @staticmethod
+    def extract_offered_by(properties: dict) -> str:
+        """
+        Determines the offer type from the properties.
+
+        :param properties: The properties containing the offer type
+        :return: "private" if the ad has no agency, otherwise "estate_agency"
+        """
+        return "private" if properties["ad"]["agency"] is None else "estate_agency"
+
+    @staticmethod
+    def extract_estate_agency_name(properties: dict) -> str:
+        """
+        Extracts the name of the estate agency from the properties.
+
+        :param properties: The properties containing the estate agency name
+        :return: The name of the estate agency
+        """
+        return properties["ad"]["agency"]["name"]
+
+    @staticmethod
+    def extract_estate_agency_details(properties: dict) -> (str, str, str, str, str):
+        """
+        Extracts the details of the estate agency from the properties.
+
+        :param properties: The properties containing the estate agency details
+        :return: A tuple of street, postal code, city, county, and province
+        """
+        address = properties["ad"]["agency"]["address"].strip().split(", ")
+        if len(address) > 5:
+            address = address[2:]
+        return address[0], address[1], address[2], "".join(address[3:-1]), address[-1]
+
+    def extract_data_from_page(self, code: ResultSet) -> None:
+        """
+        Extracts data from the page and updates the Listing instance.
+
+        This method loads the listing information from a script tag in the HTML code,
+        parses it as JSON, and uses it to update the attributes of the Listing instance.
+
+        :param code: The HTML code containing the listing information
+        """
+        listing_information = json.loads(
+            code.find("script", {"type": "application/json"}).text
+        )
+        listing_properties = listing_information["props"]["pageProps"]
+        self.otodom_id = listing_properties["ad"]["id"]
+        self.title = listing_properties["ad"]["title"]
+        (
+            self.province,
+            self.city,
+            self.district,
+            self.street,
+        ) = self.extract_localization(listing_properties)
+        self.price = listing_properties["ad"]["target"].get("Price", 0)
+        self.price_for_m2 = listing_properties["ad"]["target"].get("Price_per_m", 0)
+        self.offered_by = self.extract_offered_by(listing_properties)
+        if self.offered_by == "estate_agency":
+            self.estate_agency_name = self.extract_estate_agency_name(
+                listing_properties
+            )
+            (
+                self.estate_agency_street,
+                self.estate_agency_postal_code,
+                self.estate_agency_city,
+                self.estate_agency_county,
+                self.estate_agency_province,
+            ) = self.extract_estate_agency_details(listing_properties)
diff --git a/otodom/task_2/sebastian_rydz/src/main.py b/otodom/task_2/sebastian_rydz/src/main.py
index e69de29..922c0c3 100644
--- a/otodom/task_2/sebastian_rydz/src/main.py
+++ b/otodom/task_2/sebastian_rydz/src/main.py
@@ -0,0 +1,6 @@
+from crawler import Crawler
+
+if "__main__" == __name__:
+    crawler = Crawler()
+    crawler.start()
+    crawler.save_to_file("listings.json")
diff --git a/otodom/task_2/sebastian_rydz/src/settings/__init__.py b/otodom/task_2/sebastian_rydz/src/settings/__init__.py
index dbdcdc8..a5b78bc 100644
--- a/otodom/task_2/sebastian_rydz/src/settings/__init__.py
+++ b/otodom/task_2/sebastian_rydz/src/settings/__init__.py
@@ -1 +1 @@
-from settings import Settings  # noqa: F401
+from settings.settings import Settings  # noqa: F401
diff --git a/otodom/task_2/sebastian_rydz/src/settings/s_types.py b/otodom/task_2/sebastian_rydz/src/settings/s_types.py
index 2652a7b..ac39370 100644
--- a/otodom/task_2/sebastian_rydz/src/settings/s_types.py
+++ b/otodom/task_2/sebastian_rydz/src/settings/s_types.py
@@ -1,6 +1,23 @@
 from enum import Enum
 
 
+class PropertyType(Enum):
+    FLAT = "mieszkanie"
+    STUDIO = "kawalerka"
+    HOUSE = "dom"
+    INVESTMENT = "inwestycja"
+    ROOM = "pokoj"
+    PLOT = "dzialka"
+    VENUE = "lokal"
+    MAGAZINE = "haleimagazyny"
+    GARAGE = "garaz"
+
+
+class AuctionType(Enum):
+    SALE = "sprzedaz"
+    RENT = "wynajem"
+
+
 class Defaults:
     """
     A class that provides default values for the settings used by the application.
@@ -23,22 +40,5 @@ class Defaults:
     DEFAULT_PROVINCE = "mazowieckie"
     DEFAULT_CITY = "warszawa"
     DEFAULT_DISTRICT = None
-    DEFAULT_PROPERTY_TYPE = "mieszkanie"
-    DEFAULT_AUCTION_TYPE = "sprzedaz"
-
-
-class PropertyType(Enum):
-    FLAT = "mieszkanie"
-    STUDIO = "kawalerka"
-    HOUSE = "dom"
-    INVESTMENT = "inwestycja"
-    ROOM = "pokoj"
-    PLOT = "dzialka"
-    VENUE = "lokal"
-    MAGAZINE = "haleimagazyny"
-    GARAGE = "garaz"
-
-
-class AuctionType(Enum):
-    SALE = "sprzedaz"
-    RENT = "wynajem"
+    DEFAULT_PROPERTY_TYPE = PropertyType.FLAT
+    DEFAULT_AUCTION_TYPE = AuctionType.SALE
diff --git a/otodom/task_2/sebastian_rydz/src/settings/settings.py b/otodom/task_2/sebastian_rydz/src/settings/settings.py
index 515e2d8..583e2b2 100644
--- a/otodom/task_2/sebastian_rydz/src/settings/settings.py
+++ b/otodom/task_2/sebastian_rydz/src/settings/settings.py
@@ -1,12 +1,12 @@
 import json
 import logging
 
-from s_types import AuctionType
-from s_types import Defaults
-from s_types import PropertyType
-from utils import get_auction_type
-from utils import get_property_type
-from utils import replace_polish_characters
+from settings.s_types import AuctionType
+from settings.s_types import Defaults
+from settings.s_types import PropertyType
+from settings.utils import get_auction_type
+from settings.utils import get_property_type
+from settings.utils import replace_polish_characters
 
 AVAILABLE_PROVINCES = [
     "dolnoslaskie",
@@ -68,7 +68,7 @@ def __init__(self):
         try:
             with open("settings.json", "r", encoding="utf-8") as f:
                 settings = json.load(f)
-            self.base_url = self.__init_base_url(settings)
+            self.base_url = Defaults.DEFAULT_URL
             self.price_min, self.price_max = self.__init_price(settings)
             self.province = self.__init_province(settings)
             self.city = self.__init_city(settings)
@@ -83,25 +83,6 @@ def __init__(self):
             )
             self.set_default()
 
-    @staticmethod
-    def __init_base_url(settings: dict) -> str:
-        """
-        Initialize the base URL from the settings dictionary.
-
-        If the base URL is not a string or does not start with "https://www.otodom.pl",
-        a warning message is logged and the default base URL is returned.
-
-        :param settings: A dictionary containing the settings
-        :return: The base URL
-        """
-        base_url = settings.get("base_url")
-        if not isinstance(base_url, str) or not base_url.startswith(
-            "https://www.otodom.pl"
-        ):
-            logging.warning("Base url is not correct. Base url is set to default")
-            return Defaults.DEFAULT_URL
-        return base_url
-
     @staticmethod
     def __init_price(settings: dict) -> (int, int):
         """
@@ -159,6 +140,7 @@ def __init_province(settings: dict) -> str:
         if province not in AVAILABLE_PROVINCES:
             logging.warning("Province is not correct. Province is set to default")
             return Defaults.DEFAULT_PROVINCE
+        province = province.replace("-", "--")
         return province
 
     @staticmethod
@@ -190,7 +172,7 @@ def __init_district(settings: dict) -> str:
         :return: The district
         """
         district = settings.get("district")
-        if not isinstance(district, str):
+        if not isinstance(district, str) or district == "":
             logging.warning("District is not correct. District is set to default")
             return Defaults.DEFAULT_DISTRICT
         return replace_polish_characters(district)
@@ -262,8 +244,3 @@ def set_default(self):
         self.district = Defaults.DEFAULT_DISTRICT
         self.property_type = Defaults.DEFAULT_PROPERTY_TYPE
         self.auction_type = Defaults.DEFAULT_AUCTION_TYPE
-
-
-if __name__ == "__main__":
-    settings = Settings()
-    print(settings.__dict__)
diff --git a/otodom/task_2/sebastian_rydz/src/settings/utils.py b/otodom/task_2/sebastian_rydz/src/settings/utils.py
index 2689655..416940a 100644
--- a/otodom/task_2/sebastian_rydz/src/settings/utils.py
+++ b/otodom/task_2/sebastian_rydz/src/settings/utils.py
@@ -1,5 +1,5 @@
-from s_types import AuctionType
-from s_types import PropertyType
+from settings.s_types import AuctionType
+from settings.s_types import PropertyType
 
 AUCTION_TYPE_MAPPING = {
     "sale": AuctionType.SALE,