From 47bd17a5471f379517469efa199d14f8d21a4025 Mon Sep 17 00:00:00 2001 From: Waheeb Barqawi Date: Fri, 3 Jan 2025 23:12:30 -0500 Subject: [PATCH 1/2] web scraping functionality --- .gitignore | 2 +- core/scraper.py | 362 ++-------------------------- core/scraping/abstractdatasource.py | 40 +++ core/scraping/devpost.py | 102 ++++++++ core/scraping/ethglobal.py | 96 ++++++++ core/scraping/hackclubsource.py | 100 ++++++++ core/scraping/mlh.py | 65 +++++ core/urls.py | 1 + core/views.py | 7 + 9 files changed, 426 insertions(+), 349 deletions(-) create mode 100644 core/scraping/abstractdatasource.py create mode 100644 core/scraping/devpost.py create mode 100644 core/scraping/ethglobal.py create mode 100644 core/scraping/hackclubsource.py create mode 100644 core/scraping/mlh.py diff --git a/.gitignore b/.gitignore index e03820e..c5b523e 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,4 @@ venv celerybeat-* -settings/environments/local.py +settings/environments/local.py \ No newline at end of file diff --git a/core/scraper.py b/core/scraper.py index 8df5999..aeaaf72 100644 --- a/core/scraper.py +++ b/core/scraper.py @@ -1,355 +1,21 @@ -import re -import datetime -import requests -import itertools -import cloudscraper -from django.conf import settings -from django.utils import timezone -from bs4 import BeautifulSoup - -from core.models import ( - Hackathon, - HackathonSource, - HackathonLocation, - ReviewStatus, -) - - -class AbstractDataSource: - URL = "" - - def scrape_page(self, **kwargs): - raise NotImplementedError - - def parse_event(self, ev, **kwargs): - raise NotImplementedError - - def get_events(self, **kwargs): - evs = self.scrape_page(**kwargs) - ans = [self.parse_event(ev, **kwargs) for ev in evs] - return ans - - -class MLHSource(AbstractDataSource): - URL = "https://mlh.io/seasons/{}/events" - - def scrape_page(self, **kwargs): - scraper = cloudscraper.create_scraper() - r = scraper.get( - self.URL.format(kwargs.get("year", datetime.datetime.now().year)) - ) - - page = BeautifulSoup(r.text, "html.parser") - divs = page.find_all("div", {"class": "row"}) - - return divs[1].find_all("a", {"class": "event-link"}) + divs[2].find_all( - "a", {"class": "event-link"} - ) - - def parse_event(self, ev, **kwargs): - loc = ev.find_all("div", {"class": "event-location"})[0] - loc_data = ( - loc.find_all("span", {"itemprop": "city"})[0].contents[0] - + ", " - + loc.find_all("span", {"itemprop": "state"})[0].contents[0] - ) - name = ev.find_all("h3", {"class": "event-name"})[0].contents[0] - end_date = timezone.make_aware( - datetime.datetime.strptime( - ev.find_all("meta", {"itemprop": "endDate"})[0]["content"], "%Y-%m-%d" - ) - ) - if end_date > timezone.now(): - start_date = timezone.make_aware( - datetime.datetime.strptime( - ev.find_all("meta", {"itemprop": "startDate"})[0]["content"], - "%Y-%m-%d", - ) - ) - - hackathonLocation_input, created = HackathonLocation.objects.get_or_create( - name=loc_data, - country=None, - location=None, - ) - - evinfo = { - "name": name.rstrip(), - "start_date": start_date, - "end_date": end_date, - "location": hackathonLocation_input, - "hybrid": ev.find_all("div", {"class": "event-hybrid-notes"})[0] - .find_all("span")[0] - .contents[0][0], - "maximum_education_level": 1 - if len(ev.find_all("div", {"class": "ribbon"})) > 0 - and len(ev.find_all("div", {"class": "diversity-event-badge"})) == 0 - else 5, - "website": ev["href"], - "bg_image": ev.find_all("div", {"class": "image-wrap"})[0].find_all( - "img" - )[0]["src"], - "fg_image": ev.find_all("div", 
{"class": "event-logo"})[0].find_all( - "img" - )[0]["src"], - "is_diversity": len( - ev.find_all("div", {"class": "diversity-event-badge"}) - ) - > 0, - "source": HackathonSource.Scraped, - "scrape_source": "mlh", - "is_public": True, - } - - return evinfo - - -class DevpostSource(AbstractDataSource): - URL = "https://devpost.com/api/hackathons?status[]=upcoming&status[]=open" - - def scrape_page(self, **kwargs): - evs = [] - total = 1 - cur = 0 - i = 0 - while cur < total: - r = requests.get(self.URL + (f"&page={i}" if i > 1 else "")) - res = r.json() - evs += res["hackathons"] - total = res["meta"]["total_count"] - cur += res["meta"]["per_page"] - - return evs - - def parse_event(self, ev, **kwargs): - dates = ev["submission_period_dates"].split("-") - startdate = dates[0].strip() - enddate = dates[1].strip() - if len(startdate.split(" ")) == 2: - startdate += enddate[-6:] - if len(enddate.split(" ")) == 2: - enddate = startdate.split(" ")[0] + " " + enddate - startdate = timezone.make_aware( - datetime.datetime.strptime(startdate, "%b %d, %Y") - ) - - enddate = timezone.make_aware(datetime.datetime.strptime(enddate, "%b %d, %Y")) - - if enddate > timezone.now(): - loc = ev["displayed_location"]["location"] - - hackathonLocation_input, created = HackathonLocation.objects.get_or_create( - name=loc, - country=None, - location=None, - ) - - evinfo = { - "name": ev["title"].rstrip(), - "start_date": startdate, - "end_date": enddate, - "location": hackathonLocation_input, - "hybrid": "O" - if ev["displayed_location"]["location"] == "Online" - else "I", - "website": ev["url"], - "fg_image": ev["thumbnail_url"], - "is_restricted": ev["open_state"] != "open", - "source": HackathonSource.Scraped, - "scrape_source": "dev", - "is_public": True, - } - - return evinfo - - -class EthGlobalSource(AbstractDataSource): - URL = "https://ethglobal.com/events/hackathons" - - def scrape_page(self, **kwargs): - r = requests.get(self.URL) - page = BeautifulSoup(r.text, features="html.parser") - return page.select('a[href^="/events/"]') - - def parse_event(self, ev, **kwargs): - if ( - ev.get("href") == "/events/hackathons" - or ev.get("href") == "/events/summits" - ): - return None - - name = ev.find_all("h3")[0].contents[0] - try: - startdate = ev.find_all("time")[0].contents[0] - enddate = ev.find_all("time")[1].contents[0] - except IndexError: - return {} - startdate = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", startdate) - enddate = re.sub(r"(\d+)(st|nd|rd|th)", r"\1", enddate) - startdate = datetime.datetime.strptime(startdate, "%b %d, %Y") - enddate = datetime.datetime.strptime(enddate, "%b %d, %Y") - - end_date = timezone.make_aware(enddate) - - if end_date > timezone.now(): - loc = ", ".join(name.split()[1:]) - - hackathonLocation_input, created = HackathonLocation.objects.get_or_create( - name=loc, - country=None, - location=None, - ) - - evinfo = { - "name": name.rstrip(), - "start_date": startdate, - "end_date": enddate, - "location": hackathonLocation_input, - "website": "https://ethglobal.com" + ev.get("href"), - "fg_image": ev.find_all("img")[0]["src"], - "source": HackathonSource.Scraped, - "scrape_source": "eth", - "is_public": True, - } - - return evinfo - - -class HackClubSource(AbstractDataSource): - URL = "https://hackathons.hackclub.com/" - - def scrape_page(self, **kwargs): - r = requests.get(self.URL) - - page = BeautifulSoup(r.text, features="html.parser") - return page.find_all("div", {"class": "css-4jawwy"})[0].find_all("a") - - def parse_event(self, ev, **kwargs): - try: - loc = 
ev.find_all("span", {"itemprop": "address"})[0].contents[2] - except IndexError: - loc = "" - - end_date = timezone.make_aware( - datetime.datetime.strptime( - ev.find_all("span", {"itemprop": "endDate"})[0]["content"].split("T")[ - 0 - ], - "%Y-%m-%d", - ) - ) - if end_date > timezone.now(): - name = ev.find_all("h3")[0].contents[0] - - hackathonLocation_input, created = HackathonLocation.objects.get_or_create( - name=loc, - country=None, - location=None, - ) - - evinfo = { - "name": name.rstrip(), - "start_date": datetime.datetime.strptime( - ev.find_all("span", {"itemprop": "startDate"})[0]["content"].split( - "T" - )[0], - "%Y-%m-%d", - ), - "end_date": end_date, - "location": hackathonLocation_input, - "hybrid": ev.find_all("span", {"itemtype": "VirtualLocation"})[ - 0 - ].contents[0][0], - "maximum_education_level": 1, - "website": ev["href"], - "fg_image": ev.find_all("img")[0]["src"], - "source": HackathonSource.Scraped, - "scrape_source": "hcl", - "is_public": True, - } - - return evinfo +from core.scraping.devpost import DevpostSource +from core.scraping.ethglobal import EthGlobalSource +from core.scraping.hackclubsource import HackClubSource +from core.scraping.mlh import MLHSource def scrape_all(num): if num == 1: - evs = ( - itertools.chain.from_iterable( - [MLHSource().get_events(year=i) for i in settings.CUR_YEAR] - ) - if type(settings.CUR_YEAR) is list - else MLHSource().get_events(year=settings.CUR_YEAR) - ) + src = MLHSource() + evs = src.get_events() if num == 2: - evs = DevpostSource().get_events() + src = DevpostSource() + evs = src.get_events() if num == 3: - evs = EthGlobalSource().get_events() - else: - pass - # evs = itertools.chain( - # mlh, - # DevpostSource().get_events(), - # EthGlobalSource().get_events(), - # ) - - for ev in evs: - if ev == {}: - continue - if ev is not None: - end_date = ev["end_date"] - if timezone.is_naive(end_date): - end_date = timezone.make_aware(end_date) - - ev["duplication_id"] = ev["name"].lower().replace( - " ", "" - ) + end_date.strftime("-%Y") - - hackathon = Hackathon.objects.filter( - duplication_id=ev["duplication_id"] - ).first() - - if hackathon is not None: - pass - # hackathon.start_date = ev["start_date"] - # hackathon.end_date = end_date - # hackathon.location = ev["location"] - # hackathon.source = HackathonSource.Scraped - # hackathon.scrape_source = ev["scrape_source"] - # hackathon.review_status = ReviewStatus.Approved - # hackathon.is_public = True - # hackathon.duplication_id = ev["duplication_id"] - # hackathon.save() - else: - hackathon = Hackathon() - for attr, value in ev.items(): - setattr(hackathon, attr, value) - hackathon.review_status = ReviewStatus.Approved - hackathon.duplication_id = ev["duplication_id"] - - hackathon.save() - - -def extract_text_from_url(url): - # Send a GET request to fetch the content of the webpage - scraper = cloudscraper.create_scraper() - response = scraper.get(url) - - # Check if the request was successful - if response.status_code != 200: - raise Exception( - f"Failed to fetch the page. 
Status code: {response.status_code}"
-        )
-
-    # Parse the webpage content
-    soup = BeautifulSoup(response.content, features="html.parser")
-
-    # Extract text from paragraphs and headings (can be adjusted based on the webpage structure)
-    text = []
-    for tag in soup.find_all(["p", "h1", "h2", "h3", "h4", "h5", "h6"]):
-        if tag.get_text(strip=True):
-            text.append(tag.get_text(strip=True))
-
-    # Join the list into a single string
-    clean_text = "\n".join(text)
+        src = EthGlobalSource()
+        evs = src.get_events()
+    if num == 4:
+        src = HackClubSource()
+        evs = src.get_events()
 
-    return clean_text
+    src.save(evs)
diff --git a/core/scraping/abstractdatasource.py b/core/scraping/abstractdatasource.py
new file mode 100644
index 0000000..0ac34b0
--- /dev/null
+++ b/core/scraping/abstractdatasource.py
@@ -0,0 +1,40 @@
+import json
+
+
+def add_to_gitignore(file_path):
+    gitignore_path = ".gitignore"
+
+    try:
+        with open(gitignore_path, "r") as gitignore_file:
+            lines = gitignore_file.readlines()
+    except FileNotFoundError:
+        lines = []
+    file_to_add = file_path
+    if not any(line.strip() == file_to_add for line in lines):
+        lines.append("\n" + file_to_add + "\n")
+        print(f"Added '{file_to_add}' to .gitignore")
+    else:
+        print(f"'{file_to_add}' is already in .gitignore.")
+    with open(gitignore_path, "w") as gitignore_file:
+        gitignore_file.writelines(lines)
+
+
+class AbstractDataSource:
+    URL = ""
+
+    def scrape_page(self):
+        raise NotImplementedError
+
+    def parse_event(self, ev):
+        raise NotImplementedError
+
+    def get_events(self):
+        events = self.scrape_page()
+        return [self.parse_event(ev) for ev in events]
+
+    def save(self, evs):
+        # Dump the parsed events to a per-source JSON file for inspection.
+        file_name = f"{self.__class__.__name__}.json"
+        add_to_gitignore(file_name)
+        with open(file_name, "w") as json_file:
+            json.dump(evs, json_file, indent=4)
diff --git a/core/scraping/devpost.py b/core/scraping/devpost.py
new file mode 100644
index 0000000..11eba84
--- /dev/null
+++ b/core/scraping/devpost.py
@@ -0,0 +1,113 @@
+import cloudscraper
+from datetime import datetime
+from core.scraping.abstractdatasource import AbstractDataSource
+
+
+class DevpostSource(AbstractDataSource):
+    URL = "https://devpost.com/api/hackathons?status[]=upcoming&status[]=open"
+
+    def scrape_page(self, **kwargs):
+        page = kwargs.get("page", 1)
+        scraper = cloudscraper.create_scraper()  # handles Cloudflare's anti-bot check
+        response = scraper.get(f"{self.URL}&page={page}")
+        if response.status_code != 200:
+            return [], 0, "invalid"
+        data = response.json()
+        return (
+            data.get("hackathons", []),
+            data.get("meta", {}).get("total_count", 0),
+            "valid",
+        )
+
+    def get_events(self, **kwargs):
+        all_events = []
+        page = 1
+        total_count = 1
+        while len(all_events) < total_count:
+            events, temp, status = self.scrape_page(page=page, **kwargs)
+            if not events:
+                # An empty page (status "valid") means the listing is
+                # exhausted; a failed request ("invalid") is unlikely to heal
+                # by skipping ahead, so stop paginating in both cases.
+                break
+            total_count = temp
+            all_events.extend(events)
+            page += 1
+        return [self.parse_event(ev, **kwargs) for ev in all_events]
+
+    def parse_event(self, ev, **kwargs):
+        submission_dates = ev.get("submission_period_dates", "").split(" - ")
+        start_date_str = submission_dates[0] if len(submission_dates) > 0 else ""
+        end_date_str = (
+            submission_dates[1] if len(submission_dates) > 1 else start_date_str
+        )
+        current_year = datetime.now().year
+        start_date, end_date = self.parse_date(
+            start_date_str, end_date_str, current_year
+        )
+        location = ev.get("displayed_location", {}).get("location", "Unknown Location")
+        return {
+            "name": ev.get("title",
"Unknown Title"), + "start_date": ( + start_date.strftime("%Y-%m-%d") if start_date else "Unknown Start Date" + ), + "end_date": ( + end_date.strftime("%Y-%m-%d") if end_date else "Unknown End Date" + ), + "url": ev.get("url", "Unknown URL"), + "location": location, + } + + def parse_date(self, start_date_str, end_date_str, current_year): + start_date, end_date = None, None + try: + if start_date_str: + if "," in start_date_str: + date_parts = start_date_str.split(", ") + start_date = datetime.strptime( + f"{date_parts[0]} {date_parts[1]}", "%b %d %Y" + ).date() + date_parts = end_date_str.split(", ") + end_date = datetime.strptime( + f"{date_parts[0]} {date_parts[1]}", "%b %d %Y" + ).date() + else: + start_date = datetime.strptime( + f"{start_date_str} {current_year}", "%b %d %Y" + ) + end_date = start_date + + if end_date_str and not end_date: + end_date = datetime.strptime( + f"{end_date_str} {current_year}", "%b %d %Y" + ).date() + except ValueError: + pass + + return start_date, end_date + + +if __name__ == "__main__": + devpost_scraper = DevpostSource() + + events = devpost_scraper.get_events() + + for event in events: + print(f"Event Name: {event['name']}") + print(f"Start Date: {event['start_date']}") + print(f"End Date: {event['end_date']}") + print(f"Website URL: {event['url']}") + print(f"Location: {event['location']}") + print("-----------") + print(f"Total events fetched: {len(events)}") diff --git a/core/scraping/ethglobal.py b/core/scraping/ethglobal.py new file mode 100644 index 0000000..ced4b6d --- /dev/null +++ b/core/scraping/ethglobal.py @@ -0,0 +1,96 @@ +import cloudscraper +from bs4 import BeautifulSoup +from datetime import datetime +from core.scraping.abstractdatasource import AbstractDataSource + + +class EthGlobalSource(AbstractDataSource): + URL = "https://ethglobal.com/events/hackathons" + + def scrape_page(self): + scraper = cloudscraper.create_scraper() + response = scraper.get(self.URL) + if response.status_code != 200: + return [] + + soup = BeautifulSoup(response.text, "html.parser") + event_cards = soup.find_all("a", href=True) + events = [] + + for card in event_cards: + event_url = f"https://ethglobal.com{card['href']}" + header = card.find("header") + if header: + title = header.find("h3").get_text(strip=True) + start_date, end_date = self.extract_dates(card) + location = self.extract_location(card, title) + + events.append( + { + "name": title, + "url": event_url, + "location": location, + "start_date": start_date, + "end_date": end_date, + } + ) + + return events + + def extract_dates(self, card): + dates = card.find_all("time") + if dates: + start_date_str = dates[0].get_text(strip=True).split(", ") + end_date_str = dates[1].get_text(strip=True).split(", ") + start_date = str( + self.parse_date(f"{start_date_str[0]} {start_date_str[1]}") + ) + end_date = str(self.parse_date(f"{end_date_str[0]} {end_date_str[1]}")) + else: + dates = card.find_all("div")[-1].get_text().split(", ") + start_date = ( + str(datetime.strptime(f"{dates[0]} {dates[1]}", "%b %Y").date())[:-2] + + "xx" + ) + end_date = start_date + + return start_date, end_date + + def extract_location(self, card, name): + location_div = card.find_all("div")[2] + is_virtual = location_div.get_text(strip=True) + if is_virtual == "Virtual": + return is_virtual + return " ".join(name.split()[1:]) + + def parse_event(self, ev): + start_date = ev["start_date"] + end_date = ev["end_date"] + + return { + "name": ev.get("name", "Unknown Title"), + "start_date": (start_date if start_date else "Unknown 
Start Date"), + "end_date": (end_date if end_date else "Unknown End Date"), + "url": ev.get("url", "Unknown URL"), + "location": ev.get("location", "Unknown Location"), + } + + def parse_date(self, date_str): + try: + return datetime.strptime(date_str, "%b %d %Y").date() + except ValueError: + return None + + +if __name__ == "__main__": + ethglobal_scraper = EthGlobalSource() + events = ethglobal_scraper.get_events() + + for event in events: + print(f"Event Name: {event['name']}") + print(f"Start Date: {event['start_date']}") + print(f"End Date: {event['end_date']}") + print(f"Website URL: {event['url']}") + print(f"Location: {event['location']}") + print("-----------") + print(f"Total events fetched: {len(events)}") diff --git a/core/scraping/hackclubsource.py b/core/scraping/hackclubsource.py new file mode 100644 index 0000000..2aed87b --- /dev/null +++ b/core/scraping/hackclubsource.py @@ -0,0 +1,100 @@ +import cloudscraper +from bs4 import BeautifulSoup +from datetime import datetime +from core.scraping.abstractdatasource import AbstractDataSource + + +class HackClubSource(AbstractDataSource): + URL = "https://hackathons.hackclub.com" + + def scrape_page(self, **kwargs): + scraper = cloudscraper.create_scraper() + response = scraper.get(self.URL) + if response.status_code != 200: + return [], 0, "invalid" + + soup = BeautifulSoup(response.text, "html.parser") + + event_cards = soup.select("main > div > div a") + events = [] + + for card in event_cards: + event_url = card["href"] + title = card.find("h3", itemprop="name").get_text(strip=True) + start_date_str = card.find("span", itemprop="startDate")["content"] + end_date_str = card.find("span", itemprop="endDate")["content"] + event_type = card.find("span").get_text(strip=True) + if event_type == "In-Person": + location = card.find("span", itemprop="address").get_text(strip=True)[ + 1: + ] + else: + location = event_type + + if not self.is_upcoming(start_date_str): + continue + + events.append( + { + "name": title, + "url": event_url, + "location": location, + "start_date": start_date_str, + "end_date": end_date_str, + } + ) + + return events + + def get_events(self, **kwargs): + events = self.scrape_page(**kwargs) + return [self.parse_event(ev, **kwargs) for ev in events] + + def parse_event(self, ev, **kwargs): + start_date_str = ev["start_date"] + end_date_str = ev["end_date"] + + start_date = self.parse_datetime(start_date_str) + end_date = self.parse_datetime(end_date_str) + + return { + "name": ev.get("name", "Unknown Title"), + "start_date": ( + start_date.strftime("%Y-%m-%d") if start_date else "Unknown Start Date" + ), + "end_date": ( + end_date.strftime("%Y-%m-%d") if end_date else "Unknown End Date" + ), + "url": ev.get("url", "Unknown URL"), + "location": ev.get("location", "Unknown Location"), + } + + def parse_datetime(self, datetime_str): + try: + return datetime.strptime(datetime_str, "%Y-%m-%dT%H:%M:%S.%fZ") + except ValueError: + return None + + def is_upcoming(self, start_date_str): + current_date = datetime.utcnow() + + start_date = self.parse_datetime(start_date_str) + if start_date is None: + return False + + return start_date > current_date + + +if __name__ == "__main__": + hackclub_scraper = HackClubSource() + + events = hackclub_scraper.get_events() + + for event in events: + print(f"Event Name: {event['name']}") + print(f"Start Date: {event['start_date']}") + print(f"End Date: {event['end_date']}") + print(f"Website URL: {event['url']}") + print(f"Location: {event['location']}") + print("-----------") + 
print(f"Total events fetched: {len(events)}") diff --git a/core/scraping/mlh.py b/core/scraping/mlh.py new file mode 100644 index 0000000..bddaa7e --- /dev/null +++ b/core/scraping/mlh.py @@ -0,0 +1,65 @@ +import cloudscraper +from bs4 import BeautifulSoup +import datetime +from core.scraping.abstractdatasource import AbstractDataSource + + +class MLHSource(AbstractDataSource): + URL = "https://mlh.io/seasons/{}/events" + + def scrape_page(self, **kwargs): + scraper = cloudscraper.create_scraper() + + url = self.URL.format(kwargs.get("year", datetime.datetime.now().year)) + response = scraper.get(url) + + soup = BeautifulSoup(response.text, "html.parser") + + rows = soup.find_all("div", class_="row") + events = [] + + for row in rows: + h3_tag = row.find("h3") + if h3_tag and "Past Events" in h3_tag.text: + break + + events.extend(row.find_all("a", class_="event-link")) + return events + + def parse_event(self, ev, **kwargs): + event_name = ev.find("h3", class_="event-name").text.strip() + + start_date = ev.find("meta", itemprop="startDate")["content"] + end_date = ev.find("meta", itemprop="endDate")["content"] + + event_url = ev["href"] + + location = ev.find("div", class_="event-location") + city = location.find("span", itemprop="city").text.strip() + state = location.find("span", itemprop="state").text.strip() + + event_details = { + "name": event_name, + "start_date": start_date, + "end_date": end_date, + "url": event_url, + "location": city + ", " + state, + } + + return event_details + + +if __name__ == "__main__": + mlh_scraper = MLHSource() + + events = mlh_scraper.get_events(year=2025) + + for event in events: + print(f"Event Name: {event['name']}") + print(f"Start Date: {event['start_date']}") + print(f"End Date: {event['end_date']}") + print(f"Website URL: {event['url']}") + print(f"Location: {event['location']['city']}, {event['location']['state']}") + print("-----------") + assert len(events) == 44 + print(len(events)) diff --git a/core/urls.py b/core/urls.py index 748e018..4470043 100644 --- a/core/urls.py +++ b/core/urls.py @@ -32,5 +32,6 @@ path("scrapeMlh/", views.scrapeMlh, name="scrapeMlh"), path("scrapeDevpost/", views.scrapeDevpost, name="scrapeDevpost"), path("scrapeEth/", views.scrapeEth, name="scrapeEth"), + path("scrapeHackclub/", views.scrapeHackclub, name="scrapeHackclub"), path("export_cal/", views.calendar_generator, name="calendar_generator"), ] + static(settings.MEDIA_URL, document_root=settings.MEDIA_ROOT) diff --git a/core/views.py b/core/views.py index a40d56b..84cd083 100644 --- a/core/views.py +++ b/core/views.py @@ -291,6 +291,13 @@ def scrapeEth(request): return HttpResponse("Scraped!") +@login_required +@user_passes_test(is_admin) +def scrapeHackclub(request): + scrape_all(4) + return HttpResponse("Scraped!") + + @cache_page(60 * 60) # 1 hour cache def calendar_generator(request): tdy_date = timezone.now() From 8a995a9ce81c768f56528b9464070f85bc02db2b Mon Sep 17 00:00:00 2001 From: Waheeb Barqawi Date: Fri, 3 Jan 2025 23:12:53 -0500 Subject: [PATCH 2/2] web scraping functionality --- core/scraping/mlh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/scraping/mlh.py b/core/scraping/mlh.py index bddaa7e..ffe6163 100644 --- a/core/scraping/mlh.py +++ b/core/scraping/mlh.py @@ -52,7 +52,7 @@ def parse_event(self, ev, **kwargs): if __name__ == "__main__": mlh_scraper = MLHSource() - events = mlh_scraper.get_events(year=2025) + events = mlh_scraper.get_events() for event in events: print(f"Event Name: {event['name']}")