CA: events web rewrite WIP
jessemortenson committed Jan 21, 2025
1 parent e89aeb0 commit 435dbf7
Showing 1 changed file with 9 additions and 8 deletions.
17 changes: 9 additions & 8 deletions scrapers/ca/events_web.py
@@ -13,7 +13,8 @@
 
 class CAEventWebScraper(Scraper, LXMLMixin):
     _tz = pytz.timezone("US/Pacific")
-    date_format = "%m-%d-%Y"
+    # date_format = "%m-%d-%Y"
+    date_format = "%Y-%m-%d"
 
     def scrape(self, chamber=None, start=None, end=None):
         if start is None:
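
A minimal sketch, not part of this commit, of what the date_format switch means when the scrape window is formatted for the request; the concrete dates are illustrative, and the one-week span is taken from the "set it to a week" comment in the next hunk:

    from datetime import datetime, timedelta

    # Illustrative only: the senate calendar now appears to expect ISO-style
    # YYYY-MM-DD query values instead of the old MM-DD-YYYY ones.
    date_format = "%Y-%m-%d"   # new value in this commit
    old_format = "%m-%d-%Y"    # value commented out above

    start = datetime(2025, 1, 21)          # assumed example start date
    end = start + timedelta(days=7)        # scraper comments mention a one-week window

    print(start.strftime(date_format), end.strftime(date_format))  # 2025-01-21 2025-01-28
    print(start.strftime(old_format), end.strftime(old_format))    # 01-21-2025 01-28-2025
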
@@ -44,20 +45,20 @@ def scrape_upper(self, start, end):
         # https://www.senate.ca.gov/calendar?startdate=01-17-2024&
         # enddate=01-24-2024&committee=&committee-hearings=on
         # senate website needs start_date and end_date
         # set it to a week
-        upper_start_url = f"https://www.senate.ca.gov/calendar?startdate={start}&enddate={end}&committee=&committee-hearings=on"
+        upper_start_url = f"https://www.senate.ca.gov/calendar?startDate={start}&endDate={end}&committeeHearings=1"
         html = requests.get(upper_start_url).text
         page = lxml.html.fromstring(html)
 
-        for date_row in page.xpath('//div[contains(@class, "calendarDayContainer")]'):
-            hearing_date = date_row.xpath('.//div[@class="calendarDate"]/text()')[
+        # TODO: started to "fix" selectors here, but didn't get very far
+        for date_row in page.xpath('//div[contains(@class, "day-wrapper")]'):
+            hearing_date = date_row.xpath('.//h2[@class="date"]/text()')[
                 0
             ].strip()
             for committee_row in date_row.xpath(
-                './/div[@class="eventContainer"][1]/div[@class="panel panel-default"]'
+                './/div[contains(@class, "committee-hearings")]/div[@class="page-events__item--committee-hearing"]'
             ):
                 hearing_title = committee_row.xpath(
-                    './/div[@class="panel-heading"]//strong'
+                    './/h4[@class="page-events__title"]'
                 )[0].xpath("string()")
                 panel_content = committee_row.xpath('.//div[@class="panel-content"]')[
                     0
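
A hedged sketch, separate from the diff, of how the rewritten senate calendar request could be assembled with the new startDate/endDate/committeeHearings parameters; the urlencode call and the example dates are assumptions for illustration (the scraper itself interpolates already-formatted strings into an f-string):

    from urllib.parse import urlencode

    import requests

    # Assumed example window, formatted with the new YYYY-MM-DD date_format.
    params = urlencode(
        {
            "startDate": "2025-01-21",
            "endDate": "2025-01-28",
            "committeeHearings": 1,
        }
    )
    upper_start_url = f"https://www.senate.ca.gov/calendar?{params}"
    # https://www.senate.ca.gov/calendar?startDate=2025-01-21&endDate=2025-01-28&committeeHearings=1

    html = requests.get(upper_start_url).text
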
@@ -182,7 +183,7 @@ def scrape_upper_agenda(self, event, committees, url):
 
     def scrape_lower(self):
         lower_start_url = (
-            "https://www.assembly.ca.gov/schedules-publications/assembly-daily-file"
+            "c"
         )
         html = requests.get(lower_start_url).text
         page = lxml.html.fromstring(html)
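
For reference, a self-contained sketch, not part of the commit, exercising the new day-wrapper / page-events selectors against toy markup; only the class names come from the diff above, and the surrounding HTML structure is an assumption:

    import lxml.html

    # Toy markup shaped to match the selectors introduced above; the real
    # senate calendar page layout may differ.
    html = """
    <div class="day-wrapper">
      <h2 class="date">Wednesday, January 22, 2025</h2>
      <div class="committee-hearings">
        <div class="page-events__item--committee-hearing">
          <h4 class="page-events__title">Senate Judiciary Committee</h4>
        </div>
      </div>
    </div>
    """

    page = lxml.html.fromstring(html)
    for date_row in page.xpath('//div[contains(@class, "day-wrapper")]'):
        hearing_date = date_row.xpath('.//h2[@class="date"]/text()')[0].strip()
        for committee_row in date_row.xpath(
            './/div[contains(@class, "committee-hearings")]'
            '/div[@class="page-events__item--committee-hearing"]'
        ):
            hearing_title = committee_row.xpath(
                './/h4[@class="page-events__title"]'
            )[0].xpath("string()")
            print(hearing_date, "|", hearing_title.strip())
    # Wednesday, January 22, 2025 | Senate Judiciary Committee
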
