Factors in R

Load needed packages

pacman::p_load(forcats,ggplot2,tidyverse,lubridate)

Reorder Levels

| Goal | forcats function |
|---|---|
| Set order manually | fct_relevel(f, "b", "a", "c") |
| Set order based on another vector | fct_reorder(f, x) |
| Set order based on which category is most frequent | fct_infreq(f) |
| Set order based on when they first appear | fct_inorder(f) |
| Reverse factor order | fct_rev(f) |
| Rotate order left (n > 0) or right (n < 0) | fct_shift(f, n) |

fct_relevel()

  • Relevel to the front: the named levels move to the start, in the order given
f <- factor(letters[1:4])
levels(f)
## [1] "a" "b" "c" "d"
fct_relevel(f, "b", "c")
## [1] a b c d
## Levels: b c a d
  • Relevel to the end: after = Inf places the level after all the others
fct_relevel(f, "a", after = Inf)
## [1] a b c d
## Levels: b c d a
  • Relevel with a function: the function receives the current levels and returns them in the new order
fct_relevel(f, sort)
## [1] a b c d
## Levels: a b c d
fct_relevel(f, rev)
## [1] a b c d
## Levels: d c b a
  • fct_inorder(): reorder by first appearance
f <- factor(c("b", "b", "a", "c", "c", "c"))
fct_inorder(f)
## [1] b b a c c c
## Levels: b a c
  • fct_infreq(): reorder by frequency (most frequent level first)
fct_infreq(f)
## [1] b b a c c c
## Levels: c b a
  • fct_inseq(): reorder by the numeric value of the levels
f <- factor(1:3, levels = 3:1)
fct_inseq(f)
## [1] 1 2 3
## Levels: 1 2 3

fct_reorder

ggplot(iris, aes(fct_reorder(Species, Sepal.Width), Sepal.Width)) +
  geom_boxplot()

fct_reorder2

  • fct_reorder2(f, x, y) reorders the factor f by the y values associated with the largest x values.
  • This makes the plot easier to read because the line colours line up with the legend.
  • Notice that the legend order matches the order of the lines at their right-hand end (the largest age).
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(prop = n / sum(n))
ggplot(by_age, aes(age, prop, colour = fct_reorder2(marital, age, prop))) +
  geom_line() +
  labs(colour = "marital")

fct_rev()

f <- factor(letters[1:4],levels=letters[c(3:4,1:2)])
levels(f)
## [1] "c" "d" "a" "b"
f%>%fct_rev()
## [1] a b c d
## Levels: b a d c

fct_shift()

x<-wday(seq(ymd("2024/1/1"), ymd("2024/1/7"), by = "1 day"), label = TRUE, week_start = 1)
x
## [1] Mon Tue Wed Thu Fri Sat Sun
## Levels: Mon < Tue < Wed < Thu < Fri < Sat < Sun
fct_shift(x,-1)
## [1] Mon Tue Wed Thu Fri Sat Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

Edit Factor Labels

| Goal | forcats function |
|---|---|
| Manually change the label(s) | fct_recode(f, new_label = "old_label") |
| Systematically change all labels | fct_relabel(f, function) |

fct_recode

x <- factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit = "apple", fruit = "banana")
## [1] fruit bear  fruit dear 
## Levels: fruit bear dear

fct_relabel

iris$Species%>%fct_relabel(stringr::str_to_upper)%>%table()
## .
##     SETOSA VERSICOLOR  VIRGINICA 
##         50         50         50

Collapse or Lump Levels

fct_collapse

x<-wday(seq(ymd("2024/1/1"), ymd("2024/1/7"), by = "1 day"), label = TRUE, week_start = 1)
x
## [1] Mon Tue Wed Thu Fri Sat Sun
## Levels: Mon < Tue < Wed < Thu < Fri < Sat < Sun
lvl=levels(x)
fct_collapse(x,weekdays=lvl[1:5],weekends=lvl[6:7])
## [1] weekdays weekdays weekdays weekdays weekdays weekends weekends
## Levels: weekdays < weekends

fct_lump

  • fct_lump_min(): Lumps levels that appear fewer than min times.
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>%
  fct_lump_min(5) %>%
  table()
## .
##     A     B     C     D Other 
##    40    10     5    27     5
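  • fct_lump_prop(): Lumps levels that appear fewer than (or equal to) prop * n times, where n is the total number of observations. Here n = 87, so the threshold is 8.7: C (5) and E–I (1 each) are lumped, while B (10) is kept.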
x %>%
  fct_lump_prop(0.10) %>%
  table()
## .
##     A     B     D Other 
##    40    10    27    10
  • fct_lump_n(): Lumps all levels except for the n most frequent (or least frequent if n < 0).
x %>%
  fct_lump_n(3) %>%
  table()
## .
##     A     B     D Other 
##    40    10    27    10
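  • fct_lump_lowfreq(): Lumps together the least frequent levels, ensuring that “Other” is still the smallest level. Here B, C and E–I are lumped (20 in total); also lumping D would make “Other” (47) larger than A (40).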
x %>%
  fct_lump_lowfreq() %>%
  table()
## .
##     A     D Other 
##    40    27    20

fct_other()

  • fct_other(): Manually replace levels with “Other”. Use keep to retain only the listed levels, or drop to replace only the listed levels.
x%>%fct_other(keep = c("A", "B"))%>%table()
## .
##     A     B Other 
##    40    10    37
x%>%fct_other(drop = c("A", "B"))%>%table()
## .
##     C     D     E     F     G     H     I Other 
##     5    27     1     1     1     1     1    50

Add or Subtract Levels

fct_expand

f <- factor(letters[1:3])
fct_expand(f, "d", "e", "f")
## [1] a b c
## Levels: a b c d e f

fct_drop

f <- factor(c("a", "b"), levels = c("a", "b", "c"))
f%>%fct_drop()
## [1] a b
## Levels: a b
副教授

我的研究聚焦在机器学习和统计学的交叉领域,专注于分析高维数据的方法和算法。