CA: events web rewrite WIP
jessemortenson committed Jan 21, 2025
1 parent e89aeb0 commit 435dbf7
Showing 1 changed file with 9 additions and 8 deletions.
17 changes: 9 additions & 8 deletions scrapers/ca/events_web.py
@@ -13,7 +13,8 @@
 
 class CAEventWebScraper(Scraper, LXMLMixin):
     _tz = pytz.timezone("US/Pacific")
-    date_format = "%m-%d-%Y"
+    # date_format = "%m-%d-%Y"
+    date_format = "%Y-%m-%d"
 
     def scrape(self, chamber=None, start=None, end=None):
         if start is None:
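
A minimal sketch, not part of this commit, of what the date_format switch means when the scrape window is formatted for the request; the concrete dates are illustrative, and the one-week span is taken from the "set it to a week" comment in the next hunk:

    from datetime import datetime, timedelta

    # Illustrative only: the senate calendar now appears to expect ISO-style
    # YYYY-MM-DD query values instead of the old MM-DD-YYYY ones.
    date_format = "%Y-%m-%d"   # new value in this commit
    old_format = "%m-%d-%Y"    # value commented out above

    start = datetime(2025, 1, 21)          # assumed example start date
    end = start + timedelta(days=7)        # scraper comments mention a one-week window

    print(start.strftime(date_format), end.strftime(date_format))  # 2025-01-21 2025-01-28
    print(start.strftime(old_format), end.strftime(old_format))    # 01-21-2025 01-28-2025
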
@@ -44,20 +45,20 @@ def scrape_upper(self, start, end):
         # https://www.senate.ca.gov/calendar?startdate=01-17-2024&
         # enddate=01-24-2024&committee=&committee-hearings=on
         # senate website needs start_date and end_date
         # set it to a week
-        upper_start_url = f"https://www.senate.ca.gov/calendar?startdate={start}&enddate={end}&committee=&committee-hearings=on"
+        upper_start_url = f"https://www.senate.ca.gov/calendar?startDate={start}&endDate={end}&committeeHearings=1"
         html = requests.get(upper_start_url).text
         page = lxml.html.fromstring(html)
 
-        for date_row in page.xpath('//div[contains(@class, "calendarDayContainer")]'):
-            hearing_date = date_row.xpath('.//div[@class="calendarDate"]/text()')[
+        # TODO: started to "fix" selectors here, but didn't get very far
+        for date_row in page.xpath('//div[contains(@class, "day-wrapper")]'):
+            hearing_date = date_row.xpath('.//h2[@class="date"]/text()')[
                 0
             ].strip()
             for committee_row in date_row.xpath(
-                './/div[@class="eventContainer"][1]/div[@class="panel panel-default"]'
+                './/div[contains(@class, "committee-hearings")]/div[@class="page-events__item--committee-hearing"]'
             ):
                 hearing_title = committee_row.xpath(
-                    './/div[@class="panel-heading"]//strong'
+                    './/h4[@class="page-events__title"]'
                 )[0].xpath("string()")
                 panel_content = committee_row.xpath('.//div[@class="panel-content"]')[
                     0
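
A hedged sketch, separate from the diff, of how the rewritten senate calendar request could be assembled with the new startDate/endDate/committeeHearings parameters; the urlencode call and the example dates are assumptions for illustration (the scraper itself interpolates already-formatted strings into an f-string):

    from urllib.parse import urlencode

    import requests

    # Assumed example window, formatted with the new YYYY-MM-DD date_format.
    params = urlencode(
        {
            "startDate": "2025-01-21",
            "endDate": "2025-01-28",
            "committeeHearings": 1,
        }
    )
    upper_start_url = f"https://www.senate.ca.gov/calendar?{params}"
    # https://www.senate.ca.gov/calendar?startDate=2025-01-21&endDate=2025-01-28&committeeHearings=1

    html = requests.get(upper_start_url).text
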
@@ -182,7 +183,7 @@ def scrape_upper_agenda(self, event, committees, url):
 
     def scrape_lower(self):
         lower_start_url = (
-            "https://www.assembly.ca.gov/schedules-publications/assembly-daily-file"
+            "c"
         )
         html = requests.get(lower_start_url).text
         page = lxml.html.fromstring(html)
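
For reference, a self-contained sketch, not part of the commit, exercising the new day-wrapper / page-events selectors against toy markup; only the class names come from the diff above, and the surrounding HTML structure is an assumption:

    import lxml.html

    # Toy markup shaped to match the selectors introduced above; the real
    # senate calendar page layout may differ.
    html = """
    <div class="day-wrapper">
      <h2 class="date">Wednesday, January 22, 2025</h2>
      <div class="committee-hearings">
        <div class="page-events__item--committee-hearing">
          <h4 class="page-events__title">Senate Judiciary Committee</h4>
        </div>
      </div>
    </div>
    """

    page = lxml.html.fromstring(html)
    for date_row in page.xpath('//div[contains(@class, "day-wrapper")]'):
        hearing_date = date_row.xpath('.//h2[@class="date"]/text()')[0].strip()
        for committee_row in date_row.xpath(
            './/div[contains(@class, "committee-hearings")]'
            '/div[@class="page-events__item--committee-hearing"]'
        ):
            hearing_title = committee_row.xpath(
                './/h4[@class="page-events__title"]'
            )[0].xpath("string()")
            print(hearing_date, "|", hearing_title.strip())
    # Wednesday, January 22, 2025 | Senate Judiciary Committee
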
