-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathupdate_fares.R
76 lines (49 loc) · 2.84 KB
/
update_fares.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
library("tidyverse")
library("httr")
library("tidyr")
library("rvest")
library("dplyr")
#the original fare table scraped everything from 2010 to Septemeber 30, 2017 or 381 weeks worth of fares. To update the table, run the following.
#set the config to get around self-sign error
httr::set_config( config( ssl_verifypeer = 0L ) )
#set the URL to the main page that lists links to fare data by week
URL<-"http://web.mta.info/developers/fare.html"
#get the list of links to each weeks data
faresite<-httr::content(GET(URL))
#get the text of each link, which is a day-date
faredate<-faresite %>%
html_nodes(".last a") %>%
html_text()
#extract the URL for each link
farelink<-faresite %>%
html_nodes(".last a") %>%
html_attr("href")
#bind the two lists together to create a table with the date and link
faretbl<-as.data.frame(cbind(date=faredate, link=farelink), stringsAsFactors = FALSE)
#format the date to remove the weekday info
faretbl$date<-lubridate::mdy(faretbl$date)
#each link is relative, paste the suffix to create a full absolute path
faretbl$link<-paste0("http://web.mta.info/developers/", faretbl$link)
oldfares<-read.csv("E:/d4d/fare_history.csv", stringsAsFactors = FALSE)
max_date<-max(oldfares$date)
faretbl<-faretbl %>% filter(date>max_date)
fares<-lapply(faretbl$link[1:nrow(faretbl)], read.csv, stringsAsFactors=FALSE, skip=2)
#after looking at the data, there were some inconsistencies due to new fare types being introduced over the past year. There are a total of 25 fare types, but many of them either had a column total of zero or are fare types that aren't relevant to our analysis (e.g., mta employee fares, AirTrain, CUNY student special fare, etc.) I kept the full fare, senior discount, AFAS (ADA) and the weekly and monthly card data
#use purrr to get just the first 12 columns
fares<-map(fares, `[`, c("STATION","FF","SEN.DIS","X7.D.AFAS.UNL","X30.D.AFAS.RMF.UNL", "JOINT.RR.TKT", "X7.D.UNL", "X30.D.UNL", "TCMC"))
fares<-bind_rows(fares, .id="group")
names(fares)<-tolower(names(fares))
fares$wktot<-rowSums(fares[,3:10])
#remove depot and select bus service names
fares<-fares %>% filter(!grepl("DEPOT", station) & !grepl("@", station) & station !="MTABC - EASTCHESTER 2")
#create a table with seq_along data and the date to join the date back in after binding the list elements together
fare_group_for_join<-data.frame(date=as.Date(faretbl$date), group=as.character(seq_along(faretbl$date)))
#add dates back to the df
fares<-left_join(fares, fare_group_for_join, by="group")
adanames<-names(fares)[grepl("afas", names(fares))]
ada<-fares %>% select(station, date, adanames)
#bind the new rows to the old table
fares<-rbind(fares, oldfares)
#make sure doubling didn't occur
fares<-fares %>% distinct()
write.csv(fares, "fare_history.csv", row.names=FALSE)