DC: Event scraper for new site (#4731)

openstates · Nov 27, 2023 · 77ae8b7 · 77ae8b7
1 parent 009d269
commit 77ae8b7
Showing 1 changed file with 56 additions and 55 deletions.
diff --git a/scrapers/dc/events.py b/scrapers/dc/events.py
@@ -1,70 +1,71 @@
 import lxml.html
-import dateutil.parser
 import pytz
+import re
 
+from ics import Calendar
 from openstates.scrape import Scraper, Event
+from utils.media import get_media_type
 
 
 class DCEventScraper(Scraper):
     _tz = pytz.timezone("US/Eastern")
 
-    def scrape(self):
-        url = "https://dccouncil.gov/events/list/"
-
-        yield from self.scrape_cal_page(url)
-
-    def scrape_cal_page(self, url):
-        page = self.get(url).content
-        page = lxml.html.fromstring(page)
-        page.make_links_absolute(url)
-
-        for row in page.xpath("//article[contains(@class,'accordion')]"):
-            when = row.xpath(".//time/@datetime")[0]
-            when = dateutil.parser.parse(when)
+    bill_prefixes = {"bill": "B", "resolution": "R"}
 
-            title = row.xpath(".//h3[contains(@class,'heading-link')]/text()")[
-                0
-            ].strip()
-
-            description = row.xpath(
-                "section/div[contains(@class,'large-8')]/div[contains(@class,'base')]"
-            )[0].text_content()
-
-            # fix special chars
-            description = (
-                description.replace("\n\u2013", " ")
-                .replace("\n", " ")
-                .replace("\u203a", "")
-            )
-            description = description.replace("More about this event", "").strip()
-
-            location = row.xpath(
-                "header/div/div[contains(@class,'large-8')]/div/div[contains(@class,'text-right')]/p"
-            )[0].text_content()
-
-            event = Event(
-                name=title,
-                description=description,
-                start_date=when,
-                location_name=location,
-            )
-
-            agenda_url = row.xpath(
-                ".//a[contains(text(),'More about this event')]/@href"
+    def scrape(self):
+        # use ical to get the full feed and start dates, which aren't cleanly in the html
+        ical_url = (
+            "https://dccouncil.gov/?post_type=tribe_events&ical=1&eventDisplay=list"
+        )
+
+        ical = self.get(ical_url).text
+        self.info("Parsing event feed. This may take a moment.")
+        cal = Calendar(ical)
+        for e in cal.events:
+            yield from self.scrape_cal_page(e)
+
+    def scrape_cal_page(self, e):
+        # scrape the html to get the correct links and description
+        page = lxml.html.fromstring(self.get(e.url).content)
+
+        title = e.name
+        start = str(e.begin)
+        location = e.location
+        description = str(e.description)
+
+        event = Event(
+            title,
+            start,
+            location,
+            description=description,
+            end_date=str(e.end),
+        )
+
+        bill_regex = r"(?P<type>Bill|Resolution) (?P<session>\d+)-(?P<billnumber>\d+)"
+        matches = re.findall(bill_regex, description, flags=re.IGNORECASE)
+
+        for match in matches:
+            bill = (
+                f"{self.bill_prefixes[match[0].lower()]} {match[1]}-{match[2].zfill(4)}"
             )
-            if agenda_url != []:
-                event.add_document(
-                    "Details and Agenda", agenda_url[0], media_type="text/html"
-                )
-
-            if "committee meeting" in title.lower():
-                com_name = title.replace("Committee Meeting", "").strip()
+            event.add_bill(bill)
+
+        header = page.xpath("//header[contains(@class,'article-header')]/p[1]/text()")[
+            0
+        ]
+        if "&bullet;" in header:
+            com_name = header.split("&bullet;")[1].strip()
+            if "whole" not in com_name.lower():
                 event.add_participant(com_name, type="committee", note="host")
 
-            event.add_source(url)
+        materials = page.xpath(
+            "//section[contains(@class,'aside-section')]//a[contains(@class,'icon-link')]"
+        )
+        for mat in materials:
+            title = mat.xpath("text()")[0].strip()
+            url = mat.xpath("@href")[0]
+            event.add_document(title, url, media_type=get_media_type(url))
 
-            yield event
+        event.add_source(e.url)
 
-        if page.xpath("//a[contains(text(), 'Upcoming Events')]"):
-            next_url = page.xpath("//a[contains(text(), 'Upcoming Events')]/@href")[0]
-            yield from self.scrape_cal_page(next_url)
+        yield event