From 5928e50ca091c2970a30927593cc6dea904a15d7 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Fri, 24 Jan 2025 14:29:00 -0600 Subject: [PATCH 1/2] VA: bugfix for missing action description --- scrapers/utils/actions.py | 13 ++++++++----- scrapers/va/bills.py | 25 +++++++++++++++++-------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/scrapers/utils/actions.py b/scrapers/utils/actions.py index ea9c5251e9..0163ddc3ca 100644 --- a/scrapers/utils/actions.py +++ b/scrapers/utils/actions.py @@ -46,11 +46,14 @@ def match(self, text): matched = False for regex in self.regexes: - m = regex.search(text) - if m: - matched = True - # add any matched attrs - attrs.update(m.groupdict()) + try: + m = regex.search(text) + if m: + matched = True + # add any matched attrs + attrs.update(m.groupdict()) + except TypeError: + continue if matched: return attrs diff --git a/scrapers/va/bills.py b/scrapers/va/bills.py index f34c9f8087..1e602607d2 100644 --- a/scrapers/va/bills.py +++ b/scrapers/va/bills.py @@ -121,15 +121,24 @@ def add_actions(self, bill: Bill, legislation_id: str): for row in page["LegislationEvents"]: when = dateutil.parser.parse(row["EventDate"]).date() - action_attr = self.categorizer.categorize(row["Description"]) - classification = action_attr["classification"] + description = row["Description"] + if not description and row["VoteTally"]: + description = f"Vote {row['VoteTally']}" - bill.add_action( - chamber=self.chamber_map[row["ChamberCode"]], - description=row["Description"], - date=when, - classification=classification, - ) + if description: + action_attr = self.categorizer.categorize(description) + classification = action_attr["classification"] + + bill.add_action( + chamber=self.chamber_map[row["ChamberCode"]], + description=description, + date=when, + classification=classification, + ) + else: + self.logger.warning( + f"Could not add action due to missing description for {bill.identifier} LegislataionEventID {row['LegislationEventID']}" + ) # map reference numbers back to their actions for impact filenames # HB9F122.PDF > { 'HB9F122' => "Impact statement from DPB (HB9)" } From 62a7dc493c714e24ac084f9c2975e67abc9d23f0 Mon Sep 17 00:00:00 2001 From: Jesse Mortenson Date: Fri, 24 Jan 2025 14:31:39 -0600 Subject: [PATCH 2/2] Backoff on swallowing errors in action classifier --- scrapers/utils/actions.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/scrapers/utils/actions.py b/scrapers/utils/actions.py index 0163ddc3ca..ea9c5251e9 100644 --- a/scrapers/utils/actions.py +++ b/scrapers/utils/actions.py @@ -46,14 +46,11 @@ def match(self, text): matched = False for regex in self.regexes: - try: - m = regex.search(text) - if m: - matched = True - # add any matched attrs - attrs.update(m.groupdict()) - except TypeError: - continue + m = regex.search(text) + if m: + matched = True + # add any matched attrs + attrs.update(m.groupdict()) if matched: return attrs