import asyncio
import csv
import re
from dataclasses import astuple, dataclass, fields
from urllib.parse import urljoin

import aiohttp
from bs4 import BeautifulSoup

URL = "https://djinni.co/jobs/"


@dataclass
class Job:
    # Every field may be None: postings often omit salary, stack, or location.
    title: str | None
    company: str | None
    salary: int | None  # average of the posted range, or None if not listed
    technologies: list[str] | None
    location: list[str] | None


JOB_FIELDS = [field.name for field in fields(Job)]
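

# A parsed posting looks roughly like this (illustrative values, not real data):
#
#     Job(title="Python Developer", company="Acme", salary=3000,
#         technologies=["Python", "Django"], location=["Kyiv"])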


def salary_to_avg_int(salary_str: str) -> int:
    """
    Converts a salary string in the format '$XX,XXX - $YY,YYY' or '$ZZZ,ZZZ'
    to an integer representing the average salary.

    :param salary_str: A salary string in the format '$XX,XXX - $YY,YYY' or '$ZZZ,ZZZ'.
    :return int: The average salary as an integer.
    :raise ValueError: If the input string is not in a valid format.
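
    Illustrative examples (made-up figures):

    >>> salary_to_avg_int("$2,000 - $4,000")
    3000
    >>> salary_to_avg_int("$5,000")
    5000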
"""
salary_str = salary_str.replace("$", "").replace(",", "").replace(" ", "")
if "-" in salary_str:
min_salary, max_salary = salary_str.split("-")
min_salary = re.findall(r"\d+", min_salary)
max_salary = re.findall(r"\d+", max_salary)
return (int(min_salary[0]) + int(max_salary[0])) // 2
return int(re.findall(r"\d+", salary_str)[0])


async def get_job_info(session: aiohttp.ClientSession, url: str) -> Job:
    """
    Fetches job information from a given URL using an aiohttp session.

    :param session: An aiohttp ClientSession object to use for making HTTP requests.
    :param url: The URL of the job posting to fetch.
    :return Job: A Job object representing the job information fetched from the URL.
    :raise Exception: If there is an error fetching or parsing the job information.
    """
    async with session.get(urljoin(URL, url), ssl=False) as response:
        soup = BeautifulSoup(await response.text(), "html.parser")

        title = soup.select_one(".detail--title-wrapper > h1")
        company = soup.select_one(".job-details--title")

        salary = soup.select_one(".public-salary-item")
        if salary:
            salary = salary_to_avg_int(salary_str=salary.text)

        # ':-soup-contains' is soupsieve's non-deprecated spelling of ':contains';
        # 'Категорія:' ("Category:") labels the tech-stack section on djinni.
        tech_spans = soup.select("li:-soup-contains('Категорія:') + li span")
        # The first span is the section label itself, so skip it.
        tech_list = [span.text for span in tech_spans[1:]]

        # Guard against postings with no location block.
        location_tag = soup.select_one("span.location-text")
        location_list = (
            [part.strip() for part in location_tag.contents[0].split(",")]
            if location_tag
            else None
        )

        return Job(
            title=title.contents[0].strip() if title else None,
            company=company.text.strip() if company else None,
            salary=salary,
            technologies=tech_list or None,
            location=location_list,
        )
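

# Standalone usage sketch for get_job_info (the job path below is hypothetical):
#
#     async def demo() -> None:
#         async with aiohttp.ClientSession() as session:
#             print(await get_job_info(session, "/jobs/example-python-developer/"))
#
#     asyncio.run(demo())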


async def parser() -> list[Job]:
    """
    Parses job information from multiple pages of job postings and returns
    a list of Job objects.

    :return list[Job]: A list of Job objects parsed from the job postings.
    :raise Exception: If there is an error fetching or parsing the job information.
    """
    page_url = "?primary_keyword=Python&page=1"
    job_urls = []
    async with aiohttp.ClientSession(trust_env=True) as session:
        while True:
            async with session.get(
                urljoin(URL, page_url), ssl=False
            ) as response:
                soup = BeautifulSoup(await response.text(), "html.parser")
                job_urls.extend([a["href"] for a in soup.select("a.profile")])
            try:
                # The mobile-only "next page" button holds the next page URL.
                page_url = soup.select_one(".d-md-none > a.btn-lg")["href"]
            except TypeError:
                # select_one returned None: no next page, stop paginating.
                break
        # Fetch all postings concurrently while the session is still open.
        return await asyncio.gather(
            *[get_job_info(session, url) for url in job_urls]
        )


def write_to_csv(job_list: list[Job], csv_path: str) -> None:
    """
    Writes the list of Job objects to a CSV file at the specified path.

    :param job_list: A list of Job objects.
    :param csv_path: The path to the output CSV file.
    :return: None
    """
    # newline="" keeps the csv module from emitting blank rows on Windows.
    with open(csv_path, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(JOB_FIELDS)
        writer.writerows([astuple(job) for job in job_list])


if __name__ == "__main__":
    job_list = asyncio.run(parser())
    write_to_csv(job_list=job_list, csv_path="jobs.csv")