condensing data

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(haven)
# Read the wage extract from the project's data directory, then preview the
# first rows.
gender <- readRDS(file = "data/gender_wage_data_small.rds")
gender |>
  head()

## # A tibble: 6 × 15
##    YEAR SAMPLE        SERIAL CBSERIAL  HHWT CLUSTER GQ      PERNUM PERWT SEX    
##   <int> <int+lbl>      <dbl>    <dbl> <dbl>   <dbl> <int+l>  <dbl> <dbl> <int+l>
## 1  2024 202401 [2024… 9.06e5  2.02e12     9 2.02e12 3 [Gro…      1     9 2 [Fem…
## 2  2024 202401 [2024… 1.49e6  2.02e12    51 2.02e12 1 [Hou…      2    97 1 [Mal…
## 3  2024 202401 [2024… 6.15e4  2.02e12   111 2.02e12 1 [Hou…      2    55 1 [Mal…
## 4  2024 202401 [2024… 1.46e6  2.02e12    68 2.02e12 1 [Hou…      2    76 1 [Mal…
## 5  2024 202401 [2024… 1.25e6  2.02e12    99 2.02e12 1 [Hou…      2   101 2 [Fem…
## 6  2024 202401 [2024… 1.51e6  2.02e12    23 2.02e12 1 [Hou…      1    24 1 [Mal…
## # ℹ 5 more variables: AGE <int+lbl>, EDUC <int+lbl>, EDUCD <int+lbl>,
## #   UHRSWORK <int+lbl>, INCWAGE <dbl+lbl>

# Inspect the structure of the loaded data frame. Passing the file name as a
# quoted string only describes that string (chr "gender_wage_data_small.rds"),
# not the data it refers to, so call str() on the object itself.
str(gender)

Comparing raw wages by gender

library(tidyverse)
library(haven)

# Strip the haven value labels from INCWAGE so it becomes a plain numeric
# column.
# NOTE(review): zap_labels() keeps any labelled special codes as ordinary
# numbers; confirm INCWAGE carries no such sentinel values (or recode them to
# NA) before summarising.
gender <- mutate(gender, INCWAGE = as.numeric(zap_labels(INCWAGE)))

# Median wage within each SEX group, ignoring NA wages. INCWAGE is already
# numeric after the step above, so no further coercion is needed here.
gender |>
  group_by(SEX) |>
  summarise(median_wage = median(INCWAGE, na.rm = TRUE))

## # A tibble: 2 × 2
##   SEX        median_wage
##   <int+lbl>        <dbl>
## 1 1 [Male]         40000
## 2 2 [Female]       18100

library(haven)
library(dplyr)

# Recode SEX from its labelled codes (1, 2) into a factor with readable
# labels, and make sure INCWAGE is a plain numeric column.
gender <- gender |>
  mutate(
    INCWAGE = as.numeric(zap_labels(INCWAGE)),
    SEX = factor(
      as.numeric(zap_labels(SEX)),
      levels = c(1, 2),
      labels = c("Male", "Female")
    )
  )

# One row per SEX level holding the median wage (NA wages dropped); stored so
# it can be plotted afterwards.
gender_summary <- summarise(
  group_by(gender, SEX),
  median_wage = median(INCWAGE, na.rm = TRUE)
)

Creating a visualization

# Bar chart of median wage by sex, with a fixed fill colour for each group.
gender_summary |>
  ggplot(mapping = aes(x = SEX, y = median_wage, fill = SEX)) +
  geom_col() +
  scale_fill_manual(values = c(Male = "#4C72B0", Female = "#DD8452"))

condensing data

Clara Yoder

2026-02-27

Comparing raw wages by gender

Creating a visualization