Fix CamelCase, add pre-commit dependency
Valentine-456 committed Dec 6, 2023
1 parent 8f94f6e commit c305520
Showing 4 changed files with 78 additions and 67 deletions.
otodom/task_1/Valentyn/dev-requirements.txt (2 additions, 1 deletion)
@@ -1,2 +1,3 @@
 black
-flake8
+flake8
+pre-commit
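Note that this commit adds pre-commit to the dev requirements but includes no hook configuration. A minimal .pre-commit-config.yaml wiring up the two tools already listed above might look like the sketch below; the pinned revisions are illustrative, not taken from this repository.

    # .pre-commit-config.yaml -- hypothetical sketch, not part of this commit
    repos:
      - repo: https://github.com/psf/black
        rev: 23.11.0  # illustrative pin; use the project's actual version
        hooks:
          - id: black
      - repo: https://github.com/PyCQA/flake8
        rev: 6.1.0  # illustrative pin
        hooks:
          - id: flake8

Running pre-commit install once then makes both hooks run on every git commit.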
otodom/task_1/Valentyn/src/HouseItem.py (26 additions, 21 deletions)
@@ -23,12 +23,12 @@ def __init__(self, url: str):
         }
 
     def convert_to_ascii(self, text):
-        normalized = unicodedata.normalize("NFKD", text.replace('ł', 'l'))
+        normalized = unicodedata.normalize("NFKD", text.replace("ł", "l"))
         ascii_text = normalized.encode("ascii", "ignore").decode("ascii")
         return ascii_text
 
-    def setPrice(self, priceStr: str):
-        clean_price = priceStr.strip().replace(" zł", "").replace(" ", "")
+    def set_price(self, price_str: str):
+        clean_price = price_str.strip().replace(" zł", "").replace(" ", "")
         try:
             converted_price = int(clean_price)
             self.dictionary["price"] = converted_price
@@ -37,38 +37,43 @@ def setPrice(self, priceStr: str):
         finally:
             return self
 
-    def setTitle(self, title: str):
+    def set_title(self, title: str):
         self.dictionary["title"] = self.convert_to_ascii(title)
         return self
 
-    def setArea(self, area: str):
-        cleanNumber = area.strip().split(" ")[0].replace(",", ".").replace("\xa0", "")
-        self.dictionary["area"] = int(float(cleanNumber)) if cleanNumber != "" else None
+    def set_area(self, area: str):
+        clean_number = area.strip().split(" ")[0]
+        clean_number = clean_number.replace(",", ".").replace("\xa0", "")
+        self.dictionary["area"] = (
+            int(float(clean_number)) if clean_number != "" else None
+        )
         return self
 
-    def setRooms(self, rooms: str):
-        self.dictionary["rooms"] = int(rooms) if (rooms.strip() != "") else None
+    def set_rooms(self, rooms: str):
+        self.dictionary["rooms"] = None
+        if rooms.strip() != "":
+            self.dictionary["rooms"] = int(rooms)
         return self
 
-    def setLocalization(self, address: str):
-        addressList = address.split(", ")
-        if len(addressList) >= 5:
-            street = self.convert_to_ascii(addressList[-5])
+    def set_localization(self, address: str):
+        address_list = address.split(", ")
+        if len(address_list) >= 5:
+            street = self.convert_to_ascii(address_list[-5])
             self.dictionary["localization"]["street"] = street
-        if len(addressList) >= 4:
-            district = self.convert_to_ascii(addressList[-3])
+        if len(address_list) >= 4:
+            district = self.convert_to_ascii(address_list[-3])
             self.dictionary["localization"]["district"] = district
-        if len(addressList) >= 3:
-            city = self.convert_to_ascii(addressList[-2])
+        if len(address_list) >= 3:
+            city = self.convert_to_ascii(address_list[-2])
             self.dictionary["localization"]["city"] = city
-        if len(addressList) >= 1:
-            province = self.convert_to_ascii(addressList[-1])
+        if len(address_list) >= 1:
+            province = self.convert_to_ascii(address_list[-1])
             self.dictionary["localization"]["province"] = province
         return self
 
-    def setEstateAgency(self, agency: str):
+    def set_estate_agency(self, agency: str):
        self.dictionary["estate_agency"] = self.convert_to_ascii(agency)
         return self
 
-    def toDictionary(self):
+    def to_dictionary(self):
         return self.dictionary
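Every setter still returns self after the rename, so the fluent chaining used in Scraper.py keeps working unchanged. Here is a quick sketch of the renamed API in isolation; the URL and values are invented for illustration:

    # Hypothetical usage of the renamed HouseItem API; values are made up.
    item = (
        HouseItem("https://www.otodom.pl/pl/oferta/example-listing")
        .set_price("850 000 zł")   # parsed to the integer 850000
        .set_title("Mieszkanie na Mokotowie")
        .set_area("54,5 m²")       # parsed to the integer 54
        .set_rooms("3")
    )
    print(item.to_dictionary())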
otodom/task_1/Valentyn/src/Scraper.py (48 additions, 43 deletions)
@@ -1,82 +1,87 @@
 import requests
-from bs4 import BeautifulSoup, Tag
+from bs4 import BeautifulSoup
 from HouseItem import HouseItem
 
 
 class Scraper:
-    TARGET_URL = "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa/warszawa?limit=72"
+    TARGET_URL = (
+        "https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/"
+        "warszawa/warszawa?limit=72"
+    )
     headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
+        " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    }
 
     def __init__(self):
-        self.itemLinksVisited = []
+        self.item_links_visited = []
 
-    def startScraping(self, limit: int = 200) -> [HouseItem]:
+    def start_scraping(self, limit: int = 200) -> [HouseItem]:
         response = requests.get(self.TARGET_URL, headers=self.headers)
         if response.status_code // 100 != 2:
             print("ERROR: Could not load initial page")
             return []
 
         soup = BeautifulSoup(response.content, features="html.parser")
-        lastPageElem = soup.select_one('nav[data-cy="pagination"] a:last-of-type')
-        if lastPageElem is None:
+        last_page_elem = soup.select_one('nav[data-cy="pagination"] a:last-of-type')
+        if last_page_elem is None:
             print("ERROR: no pagination found on the page")
             return []
 
-        scrapedPages = []
-        maxPage = int(lastPageElem.getText())
-        for page in range(0, maxPage):
+        scraped_pages = []
+        max_page = int(last_page_elem.getText())
+        for page in range(0, max_page):
             print(f"> Start scraping page {page+1}...")
-            scrapedPages = scrapedPages + self.getPage(page + 1)
-            print(f"> Scraped {len(scrapedPages)} objects")
-            if len(scrapedPages) >= limit:
+            scraped_pages = scraped_pages + self.get_page(page + 1)
+            print(f"> Scraped {len(scraped_pages)} objects")
+            if len(scraped_pages) >= limit:
                 break
-        return scrapedPages
+        return scraped_pages
 
-    def getPage(self, page: int = 1) -> [HouseItem]:
+    def get_page(self, page: int = 1) -> [HouseItem]:
         response = requests.get(f"{self.TARGET_URL}&page={page}", headers=self.headers)
         if response.status_code // 100 != 2:
-            print(f"ERROR: could not load new page, status: {response.status_code}")
+            print(f"ERROR: couldn't load new page, status: {response.status_code}")
             return []
 
         soup = BeautifulSoup(response.content, features="html.parser")
-        linkElems = soup.select('a[href^="/pl/oferta/"]')
-        links = [elem.get("href") for elem in linkElems]
+        link_elems = soup.select('a[href^="/pl/oferta/"]')
+        links = [elem.get("href") for elem in link_elems]
 
-        linksFiltered = list(filter(self.filterRepeatedLinks, links))
-        self.itemLinksVisited.extend(linksFiltered)
-        return list(map(self.mapHouseItemLinks, linksFiltered))
+        links_filtered = list(filter(self.filter_repeated_links, links))
+        self.item_links_visited.extend(links_filtered)
+        return list(map(self.map_house_item_links, links_filtered))
 
-    def filterRepeatedLinks(self, suburl: str):
-        return suburl not in self.itemLinksVisited
+    def filter_repeated_links(self, suburl: str):
+        return suburl not in self.item_links_visited
 
-    def mapHouseItemLinks(self, suburl: str):
-        return self.scrapHouseItem(f"{HouseItem.base_url}{suburl}")
+    def map_house_item_links(self, suburl: str):
+        return self.scrap_house_item(f"{HouseItem.base_url}{suburl}")
 
-    def scrapHouseItem(self, url: str):
+    def scrap_house_item(self, url: str):
         response = requests.get(url, headers=self.headers)
         if response.status_code // 100 != 2:
-            print(f"ERROR: could not load item page, status: {response.status_code}")
+            print(f"ERROR: didn't load item page, status: {response.status_code}")
 
         soup = BeautifulSoup(response.content, features="html.parser")
-        titleElem = soup.select_one('h1[data-cy="adPageAdTitle"]')
-        title = titleElem.getText() if titleElem is not None else ""
-        priceElem = soup.select_one('[data-cy="adPageHeaderPrice"]')
-        price = priceElem.getText() if priceElem is not None else ""
-        areaElem = soup.select_one(
+        title_elem = soup.select_one('h1[data-cy="adPageAdTitle"]')
+        title = title_elem.getText() if title_elem else ""
+        price_elem = soup.select_one('[data-cy="adPageHeaderPrice"]')
+        price = price_elem.getText() if price_elem else ""
+        area_elem = soup.select_one(
             '[aria-label="Powierzchnia"] [data-testid="table-value-area"]'
         )
-        area = areaElem.getText() if areaElem is not None else ""
-        roomsElem = soup.select_one('[aria-label="Liczba pokoi"] a')
-        rooms = roomsElem.getText() if roomsElem is not None else ""
-        localElem = soup.select_one('a[href="#map"][aria-label="Adres"]')
-        localization = localElem.getText() if localElem is not None else ""
-        agencyElem = soup.select_one(
-            '[aria-label="Typ ogłoszeniodawcy"] [data-testid="table-value-advertiser_type"]'
+        area = area_elem.getText() if area_elem else ""
+        rooms_elem = soup.select_one('[aria-label="Liczba pokoi"] a')
+        rooms = rooms_elem.getText() if rooms_elem else ""
+        local_elem = soup.select_one('a[href="#map"][aria-label="Adres"]')
+        localization = local_elem.getText() if local_elem else ""
+        agency_elem = soup.select_one(
+            """[aria-label="Typ ogłoszeniodawcy"]
+            [data-testid="table-value-advertiser_type"]"""
        )
-        agency = agencyElem.getText() if agencyElem is not None else ""
+        agency = agency_elem.getText() if agency_elem else ""
 
-        item = HouseItem(url).setPrice(price).setTitle(title).setArea(area)
-        item.setLocalization(localization).setRooms(rooms).setEstateAgency(agency)
-        return item
+        item = HouseItem(url).set_price(price).set_title(title).set_area(area)
+        item.set_localization(localization).set_rooms(rooms)
+        return item.set_estate_agency(agency)
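One design note: filter_repeated_links tests membership against a plain list, which is O(n) per link, and the list grows with every page scraped. A set-based variant would give O(1) lookups; this is a sketch of a hypothetical alternative, not part of this commit, and it assumes the order of visited links never matters:

    # Sketch: set-based dedup as a hypothetical alternative to the list version.
    class Scraper:
        def __init__(self):
            self.item_links_visited = set()  # O(1) membership tests

        def filter_repeated_links(self, suburl: str) -> bool:
            return suburl not in self.item_links_visited

The only other change needed would be swapping list.extend for set.update when recording the filtered links in get_page.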
otodom/task_1/Valentyn/src/main.py (2 additions, 2 deletions)
@@ -2,10 +2,10 @@
 import json
 
 scraper = Scraper()
-houseItems = [i.toDictionary() for i in scraper.startScraping(limit=220)]
+house_items = [i.to_dictionary() for i in scraper.start_scraping(limit=220)]
 
 # Convert the list of dictionaries to a JSON string
-json_data = json.dumps(houseItems, indent=4)
+json_data = json.dumps(house_items, indent=4)
 
 # Write the JSON data to a file
 with open("data.json", "w") as file:
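For a quick sanity check of the output, the dump written above can be loaded back symmetrically; a sketch, assuming data.json sits in the working directory:

    import json

    # Load the listings written by main.py and report how many were scraped.
    with open("data.json") as file:
        house_items = json.load(file)
    print(f"Loaded {len(house_items)} listings")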
