Skip to content

Commit

Permalink
feat: scraper for listings
Browse files Browse the repository at this point in the history
  • Loading branch information
TheRealSeber committed Dec 29, 2023
1 parent 0b51fcd commit b0064e8
Show file tree
Hide file tree
Showing 11 changed files with 349 additions and 60 deletions.
12 changes: 6 additions & 6 deletions otodom/task_2/sebastian_rydz/settings.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"base_url": "https://www.otodom.pl",
"price": {
"min": 1000,
"max": 5000
"min": 0,
"max": 10000000
},
"province" : "kujawsko-pomorskie",
"city": "torun",
"district": "",
"province" : "warmińsko-mazurskie",
"city": "elblag",
"property_type": "flat",
"sale_or_rent": "rent",
"auction_type": "sale",
"_comments": {
"property_type": "Can be: 'flat', 'studio', 'house', 'investment', 'room', 'plot', 'venue', 'magazine', 'garage'",
"sale_or_rent": "Can be: 'sale', 'rent'"
Expand Down
1 change: 1 addition & 0 deletions otodom/task_2/sebastian_rydz/src/crawler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from crawler.crawler import Crawler # noqa: F401
148 changes: 148 additions & 0 deletions otodom/task_2/sebastian_rydz/src/crawler/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
import concurrent.futures
import json
import logging

import requests
from bs4 import BeautifulSoup
from crawler.utils import remove_duplicated_listings
from listing import Listing
from settings import Settings

# Request headers for every HTTP call: otodom.pl blocks/obfuscates responses
# for the default `requests` User-Agent, so present a desktop-browser UA.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"  # noqa: E501
}


class Crawler:
    """
    A crawler for the otodom.pl website.

    The crawler builds a search URL from the user settings, collects the
    listing cards from every result page, removes duplicates, and extracts
    the detailed data of each listing concurrently.
    """

    def __init__(self):
        """
        Initialize the crawler.

        Settings are loaded from the application configuration and the
        search query parameters are derived from them once, up front.
        """
        self.settings = Settings()
        self.params = self.generate_params()
        self.listings = list()

    def generate_search_url(self) -> str:
        """
        Generate the URL to crawl.

        :return: The URL to crawl
        """
        url = self.settings.base_url

        url += "/pl/wyniki/"
        url += self.settings.auction_type.value + "/"
        url += self.settings.property_type.value + "/"
        url += self.settings.province + "/"
        url += self.settings.city + "/"
        if self.settings.district is not None:
            # Otodom district URLs repeat the city segment for the county and
            # municipality path levels before the district, e.g.
            # .../mazowieckie/warszawa/warszawa/warszawa/mokotow/
            url += self.settings.city + "/"
            url += self.settings.city + "/"
            url += self.settings.district + "/"

        return url

    def generate_params(self) -> dict:
        """
        Generate the query-string parameters for the search URL.

        :return: The parameters for the URL
        """
        return {
            "priceMin": self.settings.price_min,
            "priceMax": self.settings.price_max,
        }

    def count_pages(self) -> int:
        """
        Count the number of result pages to crawl.

        Exits the process with status 1 when the search yields no results.

        :return: The number of pages to crawl
        """
        response = requests.get(
            url=self.generate_search_url(), params=self.params, headers=HEADERS
        )
        soup = BeautifulSoup(response.content, "html.parser")
        pages_element = soup.select("button[aria-current][data-cy]")
        # BUG FIX: select() returns a (possibly empty) list, never None, so
        # the previous `is None` guard could not fire and `[-1]` below raised
        # IndexError when nothing matched. Test for emptiness instead.
        if not pages_element:
            logging.warning("No listings found with given parameters. Exiting...")
            exit(1)
        # The last matching pagination button carries the highest page number.
        pages = pages_element[-1].text
        return int(pages)

    def extract_listings_from_page(self, page: int) -> list:
        """
        Crawl the given result page and collect its listing cards.

        :param page: The page number to crawl
        :return: The listing card elements found on the page
        """
        params = self.params.copy()
        params["page"] = page
        response = requests.get(
            url=self.generate_search_url(), params=params, headers=HEADERS
        )
        soup = BeautifulSoup(response.content, "html.parser")
        listings = soup.select("li[data-cy=listing-item]")
        return listings

    def extract_listing_data(self, listing: "Listing") -> "Listing":
        """
        Fetch the listing's own page and extract its detailed data.

        :param listing: The listing (updated in place)
        :return: The same listing, with its detail fields populated
        """
        response = requests.get(url=listing.link, headers=HEADERS)
        soup = BeautifulSoup(response.content, "html.parser")
        listing.extract_data_from_page(soup)
        return listing

    def get_listings(self) -> list:
        """
        Get the listings collected by the last :meth:`start` run.

        :return: The listings
        """
        return self.listings

    def save_to_file(self, filename: str) -> None:
        """
        Save the collected listings to a JSON file (UTF-8, human readable).

        :param filename: The name of the file
        """
        with open(filename, "w", encoding="utf-8") as file:
            json.dump(
                [obj.__dict__ for obj in self.listings],
                file,
                ensure_ascii=False,
                indent=4,
            )

    def start(self) -> None:
        """
        Start the crawler.

        Crawls every result page concurrently, deduplicates the listing
        cards, then fetches and extracts each listing's details concurrently.
        The result is stored in :attr:`listings`.
        """
        pages = self.count_pages()
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            listings = list(
                executor.map(self.extract_listings_from_page, range(1, pages + 1))
            )

        listings = remove_duplicated_listings(listings)
        listings = {Listing(listing) for listing in listings}
        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            listings = list(executor.map(self.extract_listing_data, listings))

        self.listings = listings
20 changes: 20 additions & 0 deletions otodom/task_2/sebastian_rydz/src/crawler/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from listing import Listing


def remove_duplicated_listings(listings: list) -> set:
    """
    Remove duplicated listings.

    :param listings: The listings, one iterable of cards per result page
    :return: The listings without duplicates (one per unique link)
    """
    # Flatten the per-page collections into a single set of cards first.
    all_cards = {card for page_cards in listings for card in page_cards}
    # Keep exactly one card per link; setdefault ignores later duplicates.
    unique_by_link = {}
    for card in all_cards:
        unique_by_link.setdefault(Listing.extract_link(card), card)

    return set(unique_by_link.values())
1 change: 1 addition & 0 deletions otodom/task_2/sebastian_rydz/src/listing/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from listing.listing import Listing # noqa: F401
136 changes: 136 additions & 0 deletions otodom/task_2/sebastian_rydz/src/listing/listing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import json

from bs4 import ResultSet
from settings.s_types import Defaults


class Listing:
    """
    A class that represents a listing on the otodom.pl website.

    Constructed from a search-result card (link and promotion flag); the
    remaining fields are filled in later by :meth:`extract_data_from_page`.
    """

    def __init__(self, code: "ResultSet"):
        """
        Initialize the listing from a search-result card.

        :param code: The HTML element of the listing card
        """
        self.link = Defaults.DEFAULT_URL + self.extract_link(code)
        self.promoted = self.extract_promoted(code)
        self.province = ""
        self.city = ""
        self.district = ""
        self.street = ""
        self.otodom_id = ""
        self.title = ""
        self.price = 0
        self.price_for_m2 = 0
        self.offered_by = ""
        self.estate_agency_name = ""
        self.estate_agency_street = ""
        self.estate_agency_city = ""
        self.estate_agency_postal_code = ""
        self.estate_agency_county = ""
        self.estate_agency_province = ""

    def __repr__(self) -> str:
        # BUG FIX (annotation only): __repr__ returns a string, not a dict.
        return self.__dict__.__repr__()

    @staticmethod
    def extract_link(code: "ResultSet") -> str:
        """
        Extracts the link from the HTML code.

        :param code: The HTML code containing the link
        :return: The extracted link
        """
        return code.select_one("a")["href"]

    @staticmethod
    def extract_promoted(code: "ResultSet") -> bool:
        """
        Determines whether the listing is promoted.

        :param code: The HTML code containing the promotion status
        :return: True if the listing is promoted, False otherwise
        """
        return code.select_one("article>span+div") is not None

    @staticmethod
    def extract_localization(properties: dict) -> "tuple[str, str, str, str]":
        """
        Extracts the localization details from the properties.

        District and street may be absent or plain strings; dict values carry
        the human-readable name under "name".

        :param properties: The properties containing the localization details
        :return: A tuple containing the province, city, district, and street
        """
        province = properties["ad"]["location"]["address"]["province"]["code"]
        city = properties["ad"]["location"]["address"]["city"]["code"]
        district = properties["ad"]["location"]["address"].get("district", "")
        if isinstance(district, dict):
            district = district["name"]
        street = properties["ad"]["location"]["address"].get("street", "")
        if isinstance(street, dict):
            street = street["name"]
        return province, city, district, street

    @staticmethod
    def extract_offered_by(properties: dict) -> str:
        """
        Determines the offer type from the properties.

        :param properties: The properties containing the offer type
        :return: "private" when no agency is attached, else "estate_agency"
        """
        return "private" if properties["ad"]["agency"] is None else "estate_agency"

    @staticmethod
    def extract_estate_agency_name(properties: dict) -> str:
        """
        Extracts the name of the estate agency from the properties.

        :param properties: The properties containing the estate agency name
        :return: The name of the estate agency
        """
        return properties["ad"]["agency"]["name"]

    @staticmethod
    def extract_estate_agency_details(properties: dict) -> "tuple[str, str, str, str, str]":
        """
        Extracts the address details of the estate agency from the properties.

        :param properties: The properties containing the estate agency details
        :return: A tuple of (street, postal code, city, county, province)
        """
        address = properties["ad"]["agency"]["address"].strip().split(", ")
        # Addresses longer than 5 parts carry extra leading segments; drop them.
        if len(address) > 5:
            address = address[2:]
        # NOTE(review): joining the county parts with "" loses the ", "
        # separators when the county spans several parts — confirm intended.
        return address[0], address[1], address[2], "".join(address[3:-1]), address[-1]

    def extract_data_from_page(self, code: "ResultSet") -> None:
        """
        Extracts data from the page and updates the Listing instance.

        This method loads the listing information from a script tag in the HTML code,
        parses it as JSON, and uses it to update the attributes of the Listing instance.

        :param code: The HTML code containing the listing information
        """
        listing_information = json.loads(
            code.find("script", {"type": "application/json"}).text
        )
        listing_properties = listing_information["props"]["pageProps"]
        self.otodom_id = listing_properties["ad"]["id"]
        self.title = listing_properties["ad"]["title"]
        (
            self.province,
            self.city,
            self.district,
            self.street,
        ) = self.extract_localization(listing_properties)
        self.price = listing_properties["ad"]["target"].get("Price", 0)
        self.price_for_m2 = listing_properties["ad"]["target"].get("Price_per_m", 0)
        self.offered_by = self.extract_offered_by(listing_properties)
        if self.offered_by == "estate_agency":
            # BUG FIX: extract_estate_agency_name existed but was never
            # called, leaving estate_agency_name empty for agency listings.
            self.estate_agency_name = self.extract_estate_agency_name(
                listing_properties
            )
            (
                self.estate_agency_street,
                self.estate_agency_postal_code,
                self.estate_agency_city,
                self.estate_agency_county,
                self.estate_agency_province,
            ) = self.extract_estate_agency_details(listing_properties)
6 changes: 6 additions & 0 deletions otodom/task_2/sebastian_rydz/src/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from crawler import Crawler

if __name__ == "__main__":
    # Crawl all listings matching the configured settings and persist them.
    scraper = Crawler()
    scraper.start()
    scraper.save_to_file("listings.json")
2 changes: 1 addition & 1 deletion otodom/task_2/sebastian_rydz/src/settings/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from settings import Settings # noqa: F401
from settings.settings import Settings # noqa: F401
38 changes: 19 additions & 19 deletions otodom/task_2/sebastian_rydz/src/settings/s_types.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,23 @@
from enum import Enum


class PropertyType(Enum):
    """Property categories supported by the scraper; each value is the
    Polish URL path segment otodom.pl uses for that category."""

    FLAT = "mieszkanie"
    STUDIO = "kawalerka"
    HOUSE = "dom"
    INVESTMENT = "inwestycja"
    ROOM = "pokoj"
    PLOT = "dzialka"
    VENUE = "lokal"
    MAGAZINE = "haleimagazyny"
    GARAGE = "garaz"


class AuctionType(Enum):
    """Offer kind (sale vs. rent); values are the Polish URL path
    segments otodom.pl uses in search URLs."""

    SALE = "sprzedaz"
    RENT = "wynajem"


class Defaults:
"""
A class that provides default values for the settings used by the application.
Expand All @@ -23,22 +40,5 @@ class Defaults:
DEFAULT_PROVINCE = "mazowieckie"
DEFAULT_CITY = "warszawa"
DEFAULT_DISTRICT = None
DEFAULT_PROPERTY_TYPE = "mieszkanie"
DEFAULT_AUCTION_TYPE = "sprzedaz"


class PropertyType(Enum):
FLAT = "mieszkanie"
STUDIO = "kawalerka"
HOUSE = "dom"
INVESTMENT = "inwestycja"
ROOM = "pokoj"
PLOT = "dzialka"
VENUE = "lokal"
MAGAZINE = "haleimagazyny"
GARAGE = "garaz"


class AuctionType(Enum):
SALE = "sprzedaz"
RENT = "wynajem"
DEFAULT_PROPERTY_TYPE = PropertyType.FLAT
DEFAULT_AUCTION_TYPE = AuctionType.SALE
Loading

0 comments on commit b0064e8

Please sign in to comment.