VA: event scraper overhaul (#5251)
showerst authored Jan 24, 2025
1 parent 675c42b commit 8cef4bd
Showing 1 changed file with 148 additions and 1 deletion.
149 changes: 148 additions & 1 deletion scrapers/va/events.py
@@ -1,17 +1,136 @@
from openstates.scrape import Scraper, Event
from utils.media import get_media_type
import datetime
import dateutil.parser
import json
import lxml.html
import pytz
import re


simple_html_tag_regex = re.compile("<.*?>")


class VaEventScraper(Scraper):
_tz = pytz.timezone("America/New_York")

bill_regex = r"([shbrj]+\s*\d+)"
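    # bill_regex matches Virginia bill ids such as "HB 30" or "SJ 247";
    # callers pass re.IGNORECASE, so lowercase forms like "sb1" also match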

def choose_agenda_parser(self, event: Event, url: str) -> None:
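        # route each agenda URL to the scraper for its host:
        #   lis.virginia.gov            -> senate docket API
        #   virginiageneralassembly.gov -> house committee agenda pages
        #   sfac.virginia.gov           -> senate finance & appropriations site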
if "lis.virginia" in url.lower():
self.scrape_senate_agenda(event, url)
elif "virginiageneralassembly" in url.lower():
self.scrape_house_com_agendas(event, url)
elif "sfac.virginia.gov" in url.lower():
self.scrape_senate_fac_agendas(event, url)
else:
self.error(f"Found VA agenda link with no parser {url}")

    # instead of linking directly to their agendas,
    # individual events link to committee pages that list multiple meeting agendas,
    # so loop through that table, compare the dates, and scrape the matching one(s)
def scrape_house_com_agendas(self, event: Event, url: str) -> None:
page = self.get(url).content
page = lxml.html.fromstring(page)
page.make_links_absolute(url)

for row in page.cssselect("div.agendaContainer tbody tr"):
link = row.xpath("td[1]/a")[0]
when = dateutil.parser.parse(link.text_content()).date()
if when == event.start_date.date():
                agenda_url = link.xpath("@href")[0]
                self.scrape_house_com_agenda(event, agenda_url)
                event.add_document(
                    "Agenda",
                    agenda_url,
                    media_type="text/html",
                    on_duplicate="ignore",
                )

def scrape_house_com_agenda(self, event: Event, url: str) -> None:
# https://virginiageneralassembly.gov/house/agendas/agendaItemExport.php?id=4790&ses=251
page = self.get(url).content
page = lxml.html.fromstring(page)
page.make_links_absolute(url)

for row in page.xpath("//table[contains(@summary, 'Agenda')]/tbody/tr[td[3]]"):
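            # td[1] holds the bill link and td[3] the item description;
            # the tr[td[3]] predicate skips rows without a third cell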
agenda_item = event.add_agenda_item(row.xpath("td[3]")[0].text_content())
agenda_item.add_bill(row.xpath("td[1]/a")[0].text_content())

    # individual senate events link to a page that builds itself dynamically
    # via a JSON API request, so call that API directly and parse the output
def scrape_senate_agenda(self, event: Event, url: str) -> None:
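        # the "|$" alternation guarantees findall always returns at least one
        # (possibly empty) string, so indexing [0] can never raise IndexError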
docket_id = re.findall(r"dockets\/(\d+)|$", url)[0]

        if docket_id:
            api_url = f"https://lis.virginia.gov/Calendar/api/GetDocketsByIdAsync?docketId={docket_id}"
            headers = {
                # send the docket page, not the API endpoint, as the referer
                "Referer": url,
                "webapikey": "FCE351B6-9BD8-46E0-B18F-5572F4CCA5B9",
                "User-Agent": "openstates.org",
            }
            page = self.get(api_url, headers=headers).json()
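            # assumed response shape, inferred from the lookups below (only the
            # first docket's first category is consumed):
            # {"Dockets": [{"DocketCategories": [{"DocketItems": [
            #     {"LegislationNumber": "SB100", "LegislationDescription": "..."}
            # ]}]}]}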
for row in page["Dockets"][0]["DocketCategories"][0]["DocketItems"]:
agenda_item = event.add_agenda_item(row["LegislationDescription"])
if row["LegislationNumber"]:
agenda_item.add_bill(row["LegislationNumber"])
else:
self.warning(f"No Docket ID found in {url}")

    # Senate Finance and Appropriations has its own website (sfac.virginia.gov)
def scrape_senate_fac_agendas(self, event: Event, url: str) -> None:
page = self.get(url).content
page = lxml.html.fromstring(page)
page.make_links_absolute(url)

for row in page.xpath(
"//table[@id='meetings']/tbody/tr[td and not(contains(@class,'materials'))]"
):
if row.text_content().strip() == "":
continue

when = row.xpath("td[1]")[0].text_content()
            # fix dates with a stray internal space, e.g. "01/14/ 2025"
when = when.replace(" ", "")
when = dateutil.parser.parse(when).date()

if when != event.start_date.date():
continue

for link in row.xpath("td[4]/a"):
event.add_document(
link.text_content(),
link.xpath("@href")[0],
media_type=get_media_type(
link.xpath("@href")[0], default="text/html"
),
on_duplicate="ignore",
)

for item in row.xpath("./following-sibling::tr[1]/td/p"):
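                # each <p> in the row immediately after the meeting row is one
                # agenda item, with its materials attached as links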
item_text = item.text_content().strip()
                if item_text in ("", "Materials", "Materials:"):
                    continue
agenda_item = event.add_agenda_item(item_text)
for item_link in item.xpath("a"):
                    # most link text is just "(Presentation)", so use the
                    # whole item text as the document name
event.add_document(
item_text,
item_link.xpath("@href")[0],
media_type=get_media_type(
item_link.xpath("@href")[0], default="text/html"
),
on_duplicate="ignore",
)

for match in re.findall(
self.bill_regex, item_text, flags=re.IGNORECASE
):
agenda_item.add_bill(match)

def scrape(self, start_date=None):
# TODO: what's the deal with this WebAPIKey, will it expire?
headers = {
@@ -73,6 +192,34 @@ def scrape(self, start_date=None):
)
event.add_source("https://lis.virginia.gov/schedule")

for match in re.findall(self.bill_regex, name, flags=re.IGNORECASE):
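                # event names and descriptions frequently mention bill ids like
                # "SB 30"; register any the shared bill_regex finds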
event.add_bill(match)

for match in re.findall(self.bill_regex, desc, flags=re.IGNORECASE):
event.add_bill(match)

if row["Description"]:
html_desc = lxml.html.fromstring(desc)

for link in html_desc.xpath("//a[contains(text(),'Agenda')]"):
docket_url = link.xpath("@href")[0]
event.add_document(
link.text_content(),
link.xpath("@href")[0],
media_type="text/html",
on_duplicate="ignore",
)
self.choose_agenda_parser(event, docket_url)

if "LinkURL" in row and row["LinkURL"]:
event.add_document(
"Docket Info",
row["LinkURL"],
media_type="text/html",
on_duplicate="ignore",
)
self.choose_agenda_parser(event, row["LinkURL"])

for ct, attach in enumerate(row["ScheduleFiles"]):
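                # the first attachment also serves as the primary agenda document
                # (assumption based on the ct == 0 special case below)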
if ct == 0:
event.add_document(