From 8f94f6ee77ae34ca9f3cc9f8cd9ae10fbbb032d6 Mon Sep 17 00:00:00 2001 From: Valentyn Bondarenko Date: Mon, 4 Dec 2023 13:39:17 +0100 Subject: [PATCH] Task 1 - fix nonascii chars parsing --- otodom/task_1/Valentyn/settings.json | 9 +++++++++ otodom/task_1/Valentyn/src/HouseItem.py | 26 ++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) create mode 100644 otodom/task_1/Valentyn/settings.json diff --git a/otodom/task_1/Valentyn/settings.json b/otodom/task_1/Valentyn/settings.json new file mode 100644 index 0000000..c631d14 --- /dev/null +++ b/otodom/task_1/Valentyn/settings.json @@ -0,0 +1,9 @@ +{ + "base_url": "str", + "price_min": "str", + "price_max": "str", + "city": "str", + "property_type": "str", + "only_for_sale": "bool", + "only_for_rent": "bool" +} \ No newline at end of file diff --git a/otodom/task_1/Valentyn/src/HouseItem.py b/otodom/task_1/Valentyn/src/HouseItem.py index 017a416..8f3b515 100644 --- a/otodom/task_1/Valentyn/src/HouseItem.py +++ b/otodom/task_1/Valentyn/src/HouseItem.py @@ -1,3 +1,6 @@ +import unicodedata + + class HouseItem: base_url = "https://www.otodom.pl" @@ -19,6 +22,11 @@ def __init__(self, url: str): }, } + def convert_to_ascii(self, text): + normalized = unicodedata.normalize("NFKD", text.replace('ł', 'l')) + ascii_text = normalized.encode("ascii", "ignore").decode("ascii") + return ascii_text + def setPrice(self, priceStr: str): clean_price = priceStr.strip().replace(" zł", "").replace(" ", "") try: @@ -30,11 +38,11 @@ def setPrice(self, priceStr: str): return self def setTitle(self, title: str): - self.dictionary["title"] = title + self.dictionary["title"] = self.convert_to_ascii(title) return self def setArea(self, area: str): - cleanNumber = area.strip().split(" ")[0].replace(",", ".") + cleanNumber = area.strip().split(" ")[0].replace(",", ".").replace("\xa0", "") self.dictionary["area"] = int(float(cleanNumber)) if cleanNumber != "" else None return self @@ -45,17 +53,21 @@ def setRooms(self, rooms: str): def setLocalization(self, address: str): addressList = address.split(", ") if len(addressList) >= 5: - self.dictionary["localization"]["street"] = addressList[-5] + street = self.convert_to_ascii(addressList[-5]) + self.dictionary["localization"]["street"] = street if len(addressList) >= 4: - self.dictionary["localization"]["district"] = addressList[-3] + district = self.convert_to_ascii(addressList[-3]) + self.dictionary["localization"]["district"] = district if len(addressList) >= 3: - self.dictionary["localization"]["city"] = addressList[-2] + city = self.convert_to_ascii(addressList[-2]) + self.dictionary["localization"]["city"] = city if len(addressList) >= 1: - self.dictionary["localization"]["province"] = addressList[-1] + province = self.convert_to_ascii(addressList[-1]) + self.dictionary["localization"]["province"] = province return self def setEstateAgency(self, agency: str): - self.dictionary["estate_agency"] = agency + self.dictionary["estate_agency"] = self.convert_to_ascii(agency) return self def toDictionary(self):