Skip to content

kwlyu/stat230-f24-final_project

Repository files navigation

The data were obtained from the GSS Data Explorer. The original data set is included in this repository, along with a cleaned version.

Data Wrangling

# Read the raw GSS commute/happiness extract; the path is relative to the
# working directory.
happiness_raw <- read_csv(file = "GSS_commute_happiness.csv")

# Build the analysis table from the raw extract:
#   * keep only the variables used below,
#   * drop rows whose commute is coded inapplicable, whose income is not
#     positive, or whose happiness response is "No answer",
#   * turn the education and commute labels into numbers, and
#   * collapse race and gender to two categories each.
happiness_cleaned <- happiness_raw %>%
  select(year, happy, commute, realrinc, educ, race, gender1) %>%
  filter(
    commute != ".i:  Inapplicable",
    realrinc > 0,
    happy != ".n:  No answer"
  ) %>%
  mutate(
    # Labels containing "grade" map to their first number, labels containing
    # "college" to their first number plus 12, and "No formal schooling" to 0.
    # Any other label becomes NA; NA_real_ keeps every branch a double so the
    # result type does not depend on dplyr's implicit NA casting.
    educ = case_when(
      str_detect(educ, "grade") ~ as.numeric(str_extract(educ, "\\d+")),
      str_detect(educ, "college") ~ as.numeric(str_extract(educ, "\\d+")) + 12,
      str_detect(educ, "No formal schooling") ~ 0,
      TRUE ~ NA_real_
    ),
    # str_extract() already yields NA when a label has no digits, so no
    # separate str_detect() guard is needed.
    commute = as.numeric(str_extract(commute, "\\d+")),
    # NOTE(review): every non-missing race other than "White" becomes
    # "Non White", and every non-missing gender1 other than "MALE" becomes
    # "Female" -- confirm no non-response codes remain in these columns.
    race = if_else(race == "White", "White", "Non White"),
    gender = if_else(gender1 == "MALE", "Male", "Female")
  ) %>%
  select(-gender1)

# Binary outcome for modelling: 0 when the response is "Not too happy", 1 for
# every other non-missing response; rows with any missing value are dropped.
# NOTE(review): the catch-all 1 would also absorb any non-response code other
# than "No answer" that is still present in `happy` -- confirm none remain.
happiness_recode <- drop_na(
  mutate(
    happiness_cleaned,
    happy = if_else(happy == "Not too happy", 0, 1)
  )
)

# Save the modelling table. row.names = FALSE stops write.csv() from adding an
# unnamed index column that would come back as an extra variable on re-read.
write.csv(happiness_recode, file = "happiness_recode.csv", row.names = FALSE)

EDA

# Pairwise plot matrix over every column of the cleaned data
ggpairs(data = happiness_cleaned)

# Boxplot of commute time by happiness level
# (x maps the happiness level and y the commute time, so the axes are labelled
# accordingly)
ggplot(happiness_cleaned, aes(x = happy, y = commute, fill = happy)) +
  geom_boxplot() +
  labs(title = "Commute Time Distribution by Happiness Level", 
       x = "Happiness Level", 
       y = "Commute Time (minutes)",
       fill = "Happiness Level")

# Real income (realrinc) compared across happiness levels, one box per level
happiness_cleaned %>%
  ggplot(mapping = aes(x = happy, y = realrinc, fill = happy)) +
  geom_boxplot() +
  ggtitle("Income Distribution by Happiness Level") +
  xlab("Happiness Level") +
  ylab("Real Income") +
  labs(fill = "Happiness Level")

# Share of each happiness level at every education value; position = "fill"
# rescales each bar to a total height of 1.
ggplot(data = happiness_cleaned, mapping = aes(x = educ, fill = happy)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Proportion of Happiness Levels by Education Level",
    x = "Education Level",
    y = "Proportion",
    fill = "Happiness Level"
  )

# Happiness proportions by gender, drawn as one panel per race category
happiness_cleaned %>%
  ggplot(aes(x = gender, fill = happy)) +
  geom_bar(position = "fill") +
  labs(
    title = "Proportion of Happiness by Race and Gender",
    x = "Gender",
    y = "Proportion",
    fill = "Happiness Level"
  ) +
  facet_wrap(vars(race))

Logistic Regression

# Logistic regression of the binary happiness outcome on commute time, real
# income, education, race and gender, fitted with the quasibinomial family.
happiness_glm <- glm(
  formula = happy ~ commute + realrinc + educ + race + gender,
  family = quasibinomial,
  data = happiness_recode
)

# Print the fitted model's coefficient table, dispersion estimate and deviances.
summary(happiness_glm)
## 
## Call:
## glm(formula = happy ~ commute + realrinc + educ + race + gender, 
##     family = quasibinomial, data = happiness_recode)
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.221e+00  6.479e-01   1.884  0.05987 . 
## commute     -1.330e-03  7.569e-03  -0.176  0.86061   
## realrinc     3.730e-05  1.188e-05   3.140  0.00174 **
## educ        -2.136e-04  4.661e-02  -0.005  0.99635   
## raceWhite    1.111e-02  3.345e-01   0.033  0.97352   
## genderMale   7.433e-01  2.526e-01   2.943  0.00334 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasibinomial family taken to be 1.013288)
## 
##     Null deviance: 530.41  on 873  degrees of freedom
## Residual deviance: 503.81  on 868  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Contributors 3

  •  
  •  
  •