Skip to content

Commit

Permalink
feat: scraper for listings
Browse files Browse the repository at this point in the history
  • Loading branch information
TheRealSeber committed Dec 29, 2023
1 parent 0b51fcd commit b0064e8
Show file tree
Hide file tree
Showing 11 changed files with 349 additions and 60 deletions.
12 changes: 6 additions & 6 deletions otodom/task_2/sebastian_rydz/settings.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"base_url": "https://www.otodom.pl",
"price": {
"min": 1000,
"max": 5000
"min": 0,
"max": 10000000
},
"province" : "kujawsko-pomorskie",
"city": "torun",
"district": "",
"province" : "warmińsko-mazurskie",
"city": "elblag",
"property_type": "flat",
"sale_or_rent": "rent",
"auction_type": "sale",
"_comments": {
"property_type": "Can be: 'flat', 'studio', 'house', 'investment', 'room', 'plot', 'venue', 'magazine', 'garage'",
"sale_or_rent": "Can be: 'sale', 'rent'"
Expand Down
1 change: 1 addition & 0 deletions otodom/task_2/sebastian_rydz/src/crawler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from crawler.crawler import Crawler # noqa: F401
148 changes: 148 additions & 0 deletions otodom/task_2/sebastian_rydz/src/crawler/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import concurrent.futures
import json
import logging

import requests
from bs4 import BeautifulSoup
from crawler.utils import remove_duplicated_listings
from listing import Listing
from settings import Settings

# Request headers for every HTTP call: otodom.pl blocks/obfuscates responses
# for the default `requests` User-Agent, so present a desktop-browser UA.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"  # noqa: E501
}


class Crawler:
    """
    A crawler for the otodom.pl website.

    The crawler builds a search URL from the user settings, collects the
    listing cards from every result page, removes duplicates, and extracts
    the detailed data of each listing concurrently.
    """

    def __init__(self):
        """
        Initialize the crawler.

        Settings are loaded from the application configuration and the
        search query parameters are derived from them once, up front.
        """
        self.settings = Settings()
        self.params = self.generate_params()
        self.listings = list()

    def generate_search_url(self) -> str:
        """
        Generate the URL to crawl.

        :return: The URL to crawl
        """
        url = self.settings.base_url

        url += "/pl/wyniki/"
        url += self.settings.auction_type.value + "/"
        url += self.settings.property_type.value + "/"
        url += self.settings.province + "/"
        url += self.settings.city + "/"
        if self.settings.district is not None:
            # Otodom district URLs repeat the city segment for the county and
            # municipality path levels before the district, e.g.
            # .../mazowieckie/warszawa/warszawa/warszawa/mokotow/
            url += self.settings.city + "/"
            url += self.settings.city + "/"
            url += self.settings.district + "/"

        return url

    def generate_params(self) -> dict:
        """
        Generate the query-string parameters for the search URL.

        :return: The parameters for the URL
        """
        return {
            "priceMin": self.settings.price_min,
            "priceMax": self.settings.price_max,
        }

    def count_pages(self) -> int:
        """
        Count the number of result pages to crawl.

        Exits the process with status 1 when the search yields no results.

        :return: The number of pages to crawl
        """
        response = requests.get(
            url=self.generate_search_url(), params=self.params, headers=HEADERS
        )
        soup = BeautifulSoup(response.content, "html.parser")
        pages_element = soup.select("button[aria-current][data-cy]")
        # BUG FIX: select() returns a (possibly empty) list, never None, so
        # the previous `is None` guard could not fire and `[-1]` below raised
        # IndexError when nothing matched. Test for emptiness instead.
        if not pages_element:
            logging.warning("No listings found with given parameters. Exiting...")
            exit(1)
        # The last matching pagination button carries the highest page number.
        pages = pages_element[-1].text
        return int(pages)

    def extract_listings_from_page(self, page: int) -> list:
        """
        Crawl the given result page and collect its listing cards.

        :param page: The page number to crawl
        :return: The listing card elements found on the page
        """
        params = self.params.copy()
        params["page"] = page
        response = requests.get(
            url=self.generate_search_url(), params=params, headers=HEADERS
        )
        soup = BeautifulSoup(response.content, "html.parser")
        listings = soup.select("li[data-cy=listing-item]")
        return listings

    def extract_listing_data(self, listing: "Listing") -> "Listing":
        """
        Fetch the listing's own page and extract its detailed data.

        :param listing: The listing (updated in place)
        :return: The same listing, with its detail fields populated
        """
        response = requests.get(url=listing.link, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")
        listing.extract_data_from_page(soup)
        return listing

    def get_listings(self) -> list:
        """
        Get the listings collected by the last :meth:`start` run.

        :return: The listings
        """
        return self.listings

    def save_to_file(self, filename: str) -> None:
        """
        Save the collected listings to a JSON file (UTF-8, human readable).

        :param filename: The name of the file
        """
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(
                [obj.__dict__ for obj in self.listings],
                file,
                ensure_ascii=False,
                indent=4,
            )

    def start(self) -> None:
        """
        Start the crawler.

        Crawls every result page concurrently, deduplicates the listing
        cards, then fetches and extracts each listing's details concurrently.
        The result is stored in :attr:`listings`.
        """
        pages = self.count_pages()
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            listings = list(
                executor.map(self.extract_listings_from_page, range(1, pages + 1))
            )

        listings = remove_duplicated_listings(listings)
        listings = {Listing(listing) for listing in listings}
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            listings = list(executor.map(self.extract_listing_data, listings))

        self.listings = listings
20 changes: 20 additions & 0 deletions otodom/task_2/sebastian_rydz/src/crawler/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from listing import Listing


def remove_duplicated_listings(listings: list) -> set:
    """
    Remove duplicated listings.

    :param listings: The listings, one iterable of cards per result page
    :return: The listings without duplicates (one per unique link)
    """
    # Flatten the per-page collections into a single set of cards first.
    all_cards = {card for page_cards in listings for card in page_cards}
    # Keep exactly one card per link; setdefault ignores later duplicates.
    unique_by_link = {}
    for card in all_cards:
        unique_by_link.setdefault(Listing.extract_link(card), card)

    return set(unique_by_link.values())
1 change: 1 addition & 0 deletions otodom/task_2/sebastian_rydz/src/listing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from listing.listing import Listing # noqa: F401
136 changes: 136 additions & 0 deletions otodom/task_2/sebastian_rydz/src/listing/listing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import json

from bs4 import ResultSet
from settings.s_types import Defaults


class Listing:
    """
    A class that represents a listing on the otodom.pl website.

    Constructed from a search-result card (link and promotion flag); the
    remaining fields are filled in later by :meth:`extract_data_from_page`.
    """

    def __init__(self, code: "ResultSet"):
        """
        Initialize the listing from a search-result card.

        :param code: The HTML element of the listing card
        """
        self.link = Defaults.DEFAULT_URL + self.extract_link(code)
        self.promoted = self.extract_promoted(code)
        self.province = ""
        self.city = ""
        self.district = ""
        self.street = ""
        self.otodom_id = ""
        self.title = ""
        self.price = 0
        self.price_for_m2 = 0
        self.offered_by = ""
        self.estate_agency_name = ""
        self.estate_agency_street = ""
        self.estate_agency_city = ""
        self.estate_agency_postal_code = ""
        self.estate_agency_county = ""
        self.estate_agency_province = ""

    def __repr__(self) -> str:
        # BUG FIX (annotation only): __repr__ returns a string, not a dict.
        return self.__dict__.__repr__()

    @staticmethod
    def extract_link(code: "ResultSet") -> str:
        """
        Extracts the link from the HTML code.

        :param code: The HTML code containing the link
        :return: The extracted link
        """
        return code.select_one("a")["href"]

    @staticmethod
    def extract_promoted(code: "ResultSet") -> bool:
        """
        Determines whether the listing is promoted.

        :param code: The HTML code containing the promotion status
        :return: True if the listing is promoted, False otherwise
        """
        return code.select_one("article>span+div") is not None

    @staticmethod
    def extract_localization(properties: dict) -> "tuple[str, str, str, str]":
        """
        Extracts the localization details from the properties.

        District and street may be absent or plain strings; dict values carry
        the human-readable name under "name".

        :param properties: The properties containing the localization details
        :return: A tuple containing the province, city, district, and street
        """
        province = properties["ad"]["location"]["address"]["province"]["code"]
        city = properties["ad"]["location"]["address"]["city"]["code"]
        district = properties["ad"]["location"]["address"].get("district", "")
        if isinstance(district, dict):
            district = district["name"]
        street = properties["ad"]["location"]["address"].get("street", "")
        if isinstance(street, dict):
            street = street["name"]
        return province, city, district, street

    @staticmethod
    def extract_offered_by(properties: dict) -> str:
        """
        Determines the offer type from the properties.

        :param properties: The properties containing the offer type
        :return: "private" when no agency is attached, else "estate_agency"
        """
        return "private" if properties["ad"]["agency"] is None else "estate_agency"

    @staticmethod
    def extract_estate_agency_name(properties: dict) -> str:
        """
        Extracts the name of the estate agency from the properties.

        :param properties: The properties containing the estate agency name
        :return: The name of the estate agency
        """
        return properties["ad"]["agency"]["name"]

    @staticmethod
    def extract_estate_agency_details(properties: dict) -> "tuple[str, str, str, str, str]":
        """
        Extracts the address details of the estate agency from the properties.

        :param properties: The properties containing the estate agency details
        :return: A tuple of (street, postal code, city, county, province)
        """
        address = properties["ad"]["agency"]["address"].strip().split(", ")
        # Addresses longer than 5 parts carry extra leading segments; drop them.
        if len(address) > 5:
            address = address[2:]
        # NOTE(review): joining the county parts with "" loses the ", "
        # separators when the county spans several parts — confirm intended.
        return address[0], address[1], address[2], "".join(address[3:-1]), address[-1]

    def extract_data_from_page(self, code: "ResultSet") -> None:
        """
        Extracts data from the page and updates the Listing instance.

        This method loads the listing information from a script tag in the HTML code,
        parses it as JSON, and uses it to update the attributes of the Listing instance.

        :param code: The HTML code containing the listing information
        """
        listing_information = json.loads(
            code.find("script", {"type": "application/json"}).text
        )
        listing_properties = listing_information["props"]["pageProps"]
        self.otodom_id = listing_properties["ad"]["id"]
        self.title = listing_properties["ad"]["title"]
        (
            self.province,
            self.city,
            self.district,
            self.street,
        ) = self.extract_localization(listing_properties)
        self.price = listing_properties["ad"]["target"].get("Price", 0)
        self.price_for_m2 = listing_properties["ad"]["target"].get("Price_per_m", 0)
        self.offered_by = self.extract_offered_by(listing_properties)
        if self.offered_by == "estate_agency":
            # BUG FIX: extract_estate_agency_name existed but was never
            # called, leaving estate_agency_name empty for agency listings.
            self.estate_agency_name = self.extract_estate_agency_name(
                listing_properties
            )
            (
                self.estate_agency_street,
                self.estate_agency_postal_code,
                self.estate_agency_city,
                self.estate_agency_county,
                self.estate_agency_province,
            ) = self.extract_estate_agency_details(listing_properties)
6 changes: 6 additions & 0 deletions otodom/task_2/sebastian_rydz/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from crawler import Crawler

if __name__ == "__main__":
    # Crawl all listings matching the configured settings and persist them.
    scraper = Crawler()
    scraper.start()
    scraper.save_to_file("listings.json")
2 changes: 1 addition & 1 deletion otodom/task_2/sebastian_rydz/src/settings/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from settings import Settings # noqa: F401
from settings.settings import Settings # noqa: F401
38 changes: 19 additions & 19 deletions otodom/task_2/sebastian_rydz/src/settings/s_types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
from enum import Enum


class PropertyType(Enum):
    """Property categories supported by the scraper; each value is the
    Polish URL path segment otodom.pl uses for that category."""

    FLAT = "mieszkanie"
    STUDIO = "kawalerka"
    HOUSE = "dom"
    INVESTMENT = "inwestycja"
    ROOM = "pokoj"
    PLOT = "dzialka"
    VENUE = "lokal"
    MAGAZINE = "haleimagazyny"
    GARAGE = "garaz"


class AuctionType(Enum):
    """Offer kind (sale vs. rent); values are the Polish URL path
    segments otodom.pl uses in search URLs."""

    SALE = "sprzedaz"
    RENT = "wynajem"


class Defaults:
"""
A class that provides default values for the settings used by the application.
Expand All @@ -23,22 +40,5 @@ class Defaults:
DEFAULT_PROVINCE = "mazowieckie"
DEFAULT_CITY = "warszawa"
DEFAULT_DISTRICT = None
DEFAULT_PROPERTY_TYPE = "mieszkanie"
DEFAULT_AUCTION_TYPE = "sprzedaz"


class PropertyType(Enum):
FLAT = "mieszkanie"
STUDIO = "kawalerka"
HOUSE = "dom"
INVESTMENT = "inwestycja"
ROOM = "pokoj"
PLOT = "dzialka"
VENUE = "lokal"
MAGAZINE = "haleimagazyny"
GARAGE = "garaz"


class AuctionType(Enum):
SALE = "sprzedaz"
RENT = "wynajem"
DEFAULT_PROPERTY_TYPE = PropertyType.FLAT
DEFAULT_AUCTION_TYPE = AuctionType.SALE
Loading

0 comments on commit b0064e8

Please sign in to comment.