library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.2 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
# Load the serialized wage data set and preview its first six rows.
gender <- readRDS(file = "data/gender_wage_data_small.rds")
head(gender, n = 6L)
## # A tibble: 6 × 15
## YEAR SAMPLE SERIAL CBSERIAL HHWT CLUSTER GQ PERNUM PERWT SEX
## <int> <int+lbl> <dbl> <dbl> <dbl> <dbl> <int+l> <dbl> <dbl> <int+l>
## 1 2024 202401 [2024… 9.06e5 2.02e12 9 2.02e12 3 [Gro… 1 9 2 [Fem…
## 2 2024 202401 [2024… 1.49e6 2.02e12 51 2.02e12 1 [Hou… 2 97 1 [Mal…
## 3 2024 202401 [2024… 6.15e4 2.02e12 111 2.02e12 1 [Hou… 2 55 1 [Mal…
## 4 2024 202401 [2024… 1.46e6 2.02e12 68 2.02e12 1 [Hou… 2 76 1 [Mal…
## 5 2024 202401 [2024… 1.25e6 2.02e12 99 2.02e12 1 [Hou… 2 101 2 [Fem…
## 6 2024 202401 [2024… 1.51e6 2.02e12 23 2.02e12 1 [Hou… 1 24 1 [Mal…
## # ℹ 5 more variables: AGE <int+lbl>, EDUC <int+lbl>, EDUCD <int+lbl>,
## # UHRSWORK <int+lbl>, INCWAGE <dbl+lbl>
# Inspect the structure (column names, types, labels) of the loaded data.
# Pass the `gender` object itself: calling str() on the file name only
# describes a length-one character vector, not the data set.
str(gender)
Comparing raw wages by gender
library(tidyverse)
library(haven)
# Strip the haven value labels from INCWAGE so it is a plain numeric column.
# NOTE(review): IPUMS-style wage variables may carry sentinel codes for
# "N/A"/"missing" (e.g. 999999) -- confirm these were removed upstream,
# otherwise they would be included in the median below.
gender <- mutate(gender, INCWAGE = as.numeric(zap_labels(INCWAGE)))

# Median wage income for each SEX code, ignoring missing values.
# INCWAGE is already a plain double at this point, so no further coercion
# is needed inside median().
gender |>
  group_by(SEX) |>
  summarize(median_wage = median(INCWAGE, na.rm = TRUE))
## # A tibble: 2 × 2
## SEX median_wage
## <int+lbl> <dbl>
## 1 1 [Male] 40000
## 2 2 [Female] 18100
library(haven)
library(dplyr)
# Turn SEX from labelled integer codes into a two-level factor
# (1 -> "Male", 2 -> "Female") and keep INCWAGE as a plain numeric vector.
gender <- gender |>
  mutate(
    INCWAGE = as.numeric(zap_labels(INCWAGE)),
    SEX = factor(
      as.numeric(zap_labels(SEX)),
      levels = c(1, 2),
      labels = c("Male", "Female")
    )
  )
# One row per SEX level holding the median of INCWAGE (NAs ignored).
# summarise() drops the single grouping level, so the result is ungrouped.
gender_summary <- gender |>
  group_by(SEX) |>
  summarise(median_wage = median(INCWAGE, na.rm = TRUE))
Creating a visualization
# Bar chart of median wage by sex, one bar per SEX level, with a fixed
# fill color assigned to each level by name.
gender_summary |>
  ggplot(mapping = aes(SEX, median_wage, fill = SEX)) +
  geom_col() +
  scale_fill_manual(
    values = c("Male" = "#4C72B0", "Female" = "#DD8452")
  )
