Skip to content

Commit 8f94f6e

Browse files
committed
Task 1 - fix nonascii chars parsing
1 parent 3eb0a18 commit 8f94f6e

File tree

2 files changed

+28
-7
lines changed

2 files changed

+28
-7
lines changed

otodom/task_1/Valentyn/settings.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{
2+
"base_url": "str",
3+
"price_min": "str",
4+
"price_max": "str",
5+
"city": "str",
6+
"property_type": "str",
7+
"only_for_sale": "bool",
8+
"only_for_rent": "bool"
9+
}

otodom/task_1/Valentyn/src/HouseItem.py

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
import unicodedata
2+
3+
14
class HouseItem:
25
base_url = "https://www.otodom.pl"
36

@@ -19,6 +22,11 @@ def __init__(self, url: str):
1922
},
2023
}
2124

25+
def convert_to_ascii(self, text):
    """Return *text* transliterated to plain ASCII.

    Accented letters are reduced to their base letter through NFKD
    decomposition (for example "ó" -> "o", "ż" -> "z").  The Polish
    stroke-L has no Unicode decomposition, so both "ł" and "Ł" are mapped
    explicitly; handling only the lowercase form would silently drop the
    capital letter (turning "Łódź" into "odz").  Any character that still
    has no ASCII form after normalisation is discarded.

    Args:
        text: The string to convert.

    Returns:
        A ``str`` containing only ASCII characters.
    """
    # One translate pass covers both cases of the stroke-L.
    without_stroke = text.translate({ord("ł"): "l", ord("Ł"): "L"})
    normalized = unicodedata.normalize("NFKD", without_stroke)
    # The "ignore" handler drops the combining marks left by NFKD.
    return normalized.encode("ascii", "ignore").decode("ascii")
29+
2230
def setPrice(self, priceStr: str):
2331
clean_price = priceStr.strip().replace(" zł", "").replace(" ", "")
2432
try:
@@ -30,11 +38,11 @@ def setPrice(self, priceStr: str):
3038
return self
3139

3240
def setTitle(self, title: str):
    """Store the ASCII-converted *title* under the "title" key.

    Returns:
        ``self``, so that setter calls can be chained.
    """
    ascii_title = self.convert_to_ascii(title)
    self.dictionary["title"] = ascii_title
    return self
3543

3644
def setArea(self, area: str):
    """Parse the leading number of *area* and store it under "area".

    Only the text before the first regular space is considered, so a
    trailing unit is ignored.  A decimal comma is accepted, and no-break,
    narrow no-break and thin spaces inside the number (digit-group
    separators) are removed before conversion; stripping U+00A0 alone
    would leave the other separator variants to break ``float()``.  The
    value is truncated to an ``int``; an empty token stores ``None``.

    Args:
        area: Raw area text, e.g. "54,5 m²".

    Returns:
        ``self``, so that setter calls can be chained.

    Raises:
        ValueError: If the leading token is non-empty but not a number.
    """
    token = area.strip().split(" ")[0]
    # Single pass: decimal comma -> point, space-like group separators removed.
    clean_number = token.translate(
        {
            ord(","): ".",
            0x00A0: None,  # no-break space
            0x202F: None,  # narrow no-break space
            0x2009: None,  # thin space
        }
    )
    self.dictionary["area"] = int(float(clean_number)) if clean_number else None
    return self
4048

@@ -45,17 +53,21 @@ def setRooms(self, rooms: str):
4553
def setLocalization(self, address: str):
    """Split *address* on ", " and store its ASCII-converted parts.

    Parts are addressed from the end of the list.  Each one is written to
    ``self.dictionary["localization"]`` only when the address has at least
    the listed number of parts:

    * "street"   - fifth from the end, needs 5 parts
    * "district" - third from the end, needs 4 parts
    * "city"     - second from the end, needs 3 parts
    * "province" - last part, needs 1 part

    Returns:
        ``self``, so that setter calls can be chained.
    """
    parts = address.split(", ")
    part_count = len(parts)
    localization = self.dictionary["localization"]
    # (key, index from the end, minimum number of parts required)
    layout = (
        ("street", -5, 5),
        ("district", -3, 4),
        ("city", -2, 3),
        ("province", -1, 1),
    )
    for key, index, required in layout:
        if part_count >= required:
            localization[key] = self.convert_to_ascii(parts[index])
    return self
5668

5769
def setEstateAgency(self, agency: str):
    """Store the ASCII-converted *agency* name under "estate_agency".

    Returns:
        ``self``, so that setter calls can be chained.
    """
    ascii_agency = self.convert_to_ascii(agency)
    self.dictionary["estate_agency"] = ascii_agency
    return self
6072

6173
def toDictionary(self):

0 commit comments

Comments
 (0)