
Commit

Deploy 0.1.1 (#28)
Yan-ni authored Feb 22, 2025
2 parents 56c3e0e + be93641 commit 9c95d38
Showing 3 changed files with 33 additions and 16 deletions.
11 changes: 2 additions & 9 deletions src/helper/functions.py
@@ -59,26 +59,19 @@ def scrape_jobs(args):
     )
     logging.debug("{:<8} {:<8}".format("Page", "Job offers"))
 
-    while True:
+    while search_page is not None:
         page_job_offers_urls = search_page.get_jobs_offers_urls()
 
-        logging.debug(
+        logging.info(
             "{:<8} {:<8}".format(
                 search_page.get_page_number(), len(page_job_offers_urls)
             )
         )
 
         total_job_offers_urls = total_job_offers_urls.union(page_job_offers_urls)
 
-        if len(page_job_offers_urls) < 30:
-            break
-
         search_page = search_page.next_page()
 
-    logging.info(
-        f"retrieved {len(total_job_offers_urls)} job offers from {search_page.get_page_number()} pages."
-    )
-
     db_cursor.close()
     db_connection.close()
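This change replaces the old stopping heuristic (break when a page returns fewer than 30 offers) with an explicit sentinel: the loop runs until next_page() returns None. A minimal sketch of that sentinel-driven loop, using a hypothetical FakePage stand-in for SearchPage rather than the real Selenium-backed class:

# Sentinel-driven pagination, as in the rewritten scrape_jobs loop.
# FakePage is a hypothetical stand-in for SearchPage.
class FakePage:
    def __init__(self, number: int, max_pages: int):
        self.number = number
        self.max_pages = max_pages

    def get_jobs_offers_urls(self) -> set[str]:
        # Three fake urls per page; the real class scrapes them with Selenium.
        return {f"https://example.com/job/{self.number}-{i}" for i in range(3)}

    def next_page(self):
        # None past the last page is the loop's termination signal.
        if self.number + 1 > self.max_pages:
            return None
        return FakePage(self.number + 1, self.max_pages)

total: set[str] = set()
page = FakePage(1, max_pages=3)
while page is not None:  # mirrors `while search_page is not None:`
    total |= page.get_jobs_offers_urls()
    page = page.next_page()
print(len(total))  # 9 urls across 3 pages

The removal of the final logging.info call follows from the new contract: once the loop exits, search_page is None, so search_page.get_page_number() would raise there.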
3 changes: 2 additions & 1 deletion src/scraper/scraper.py
@@ -32,7 +32,7 @@ class Scraper:
"""A representation of the scraper that is a combination of selenium and beautiful soup."""

@staticmethod
def get_url_soup(url: str) -> BeautifulSoup:
def get_url_soup(url: str, waitCheck: str = "true") -> BeautifulSoup:
"""Fetch the url and return its page source soup."""
chrome_options = Options()
chrome_options.add_argument("--headless=new")
@@ -51,6 +51,7 @@ def get_url_soup(url: str) -> BeautifulSoup:
             lambda driver: driver.execute_script("return document.readyState")
             == "complete"
             and wait_for_all_requests_to_complete(driver)
+            and driver.execute_script(f"return {waitCheck}")
         )
 
         return BeautifulSoup(browser.page_source, "html.parser")
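The new waitCheck parameter is a JavaScript expression that the existing WebDriverWait condition evaluates through execute_script; the default "true" keeps the previous behaviour for callers that pass nothing. A minimal standalone sketch of the same pattern, assuming Selenium 4; the URL and predicate are illustrative, not from this repository:

# Wait on a caller-supplied JS predicate, as get_url_soup now does.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

wait_check = "document.querySelectorAll('p').length > 0"  # illustrative predicate

options = Options()
options.add_argument("--headless=new")
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://example.com")
    WebDriverWait(driver, timeout=10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
        and d.execute_script(f"return {wait_check}")
    )
    html = driver.page_source
finally:
    driver.quit()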
35 changes: 29 additions & 6 deletions src/welcome_to_the_jungle/search_page.py
@@ -1,8 +1,10 @@
 from scraper import Scraper
 from database import ScrapeDB
+import math
 
 
 class SearchPage:
+    max_pages: int = None
     """A representation of (welcome to the jungle) search page.
 
     Attributes:
@@ -34,11 +36,11 @@ def __init__(
     def save_scrape_to_db(self):
         self.__db_cur.execute(
             """UPDATE scrapes SET
-            query = %(query)s,
-            contract_type = %(contract_type)s,
-            location = %(location)s,
-            country_code = %(country_code)s
-            WHERE id=%(scrape_id)s""",
+                query = %(query)s,
+                contract_type = %(contract_type)s,
+                location = %(location)s,
+                country_code = %(country_code)s
+                WHERE id=%(scrape_id)s""",
             {
                 "query": self.get_query(),
                 "contract_type": self.get_contract_type(),
@@ -67,12 +69,30 @@ def get_page_number(self):
     def get_url(self) -> str:
         return f"https://www.welcometothejungle.com/en/jobs?refinementList[offices.country_code][]={self.get_country_code()}&refinementList[contract_type][]={self.get_contract_type()}&query={self.get_query()}&page={self.get_page_number()}&aroundQuery={self.get_location()}&searchTitle=true"
 
+    def set_max_pages(self, soup) -> None:
+        number_job_offers_tag = soup.select_one(
+            "[data-testid='jobs-search-results-count']"
+        )
+        number_job_offers: int = int(number_job_offers_tag.get_text(""))
+
+        SearchPage.max_pages = math.ceil(number_job_offers / 30)
+
+    @classmethod
+    def get_max_pages(cls) -> int:
+        return cls.max_pages
+
     def get_jobs_offers_urls(self) -> set[str]:
         """Return a set of job urls present in the current search page."""
-        soup = Scraper.get_url_soup(self.get_url())
+        soup = Scraper.get_url_soup(
+            self.get_url(),
+            waitCheck="document.querySelectorAll(\"[data-testid='search-results-list-item-wrapper']\").length > 0",
+        )
         elements = soup.select("li > div > div > a")
         jobs_urls = set()
 
+        if self.page_number == 1:
+            self.set_max_pages(soup)
+
         for element in elements:
             if "href" in element.attrs:
                 jobs_urls.add(f"https://www.welcometothejungle.com{element['href']}")
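set_max_pages reads the result counter that page 1 renders, then divides by the 30 offers shown per search page and rounds up with math.ceil. A worked example with an illustrative count:

import math

RESULTS_PER_PAGE = 30   # offers per welcometothejungle search page
number_job_offers = 95  # illustrative value of the scraped results counter
max_pages = math.ceil(number_job_offers / RESULTS_PER_PAGE)
print(max_pages)        # 4: three full pages plus 5 offers on page 4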
@@ -81,6 +101,9 @@ def next_page(self):
     def next_page(self):
         """Return next search page"""
+        if self.page_number + 1 > self.get_max_pages():
+            return None
+
         return SearchPage(
             page_number=(self.page_number + 1),
             db_cursor=self.__db_cur,
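This guard is what gives next_page() the None sentinel that the rewritten loop in src/helper/functions.py tests for. It assumes max_pages has already been populated, which holds because the loop always scrapes a page (triggering set_max_pages on page 1) before asking for the next one. A small sketch of that ordering assumption, with hypothetical names and values:

# SearchPage.max_pages stays None until page 1 is parsed, so asking for
# the next page only makes sense after at least one scrape. Hypothetical:
class Pager:
    max_pages = None  # mirrors SearchPage.max_pages before any scrape

    def has_next(self, page_number: int) -> bool:
        if Pager.max_pages is None:
            raise RuntimeError("parse page 1 first to learn the page count")
        return page_number + 1 <= Pager.max_pages

pager = Pager()
Pager.max_pages = 4       # normally set by set_max_pages() on page 1
print(pager.has_next(3))  # True: page 4 exists
print(pager.has_next(4))  # False: past the last page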
