Factors in R

最近更新于 Apr 18, 2024 4 分钟阅读时长 R

Load needed packages

pacman::p_load(forcats,ggplot2,tidyverse,lubridate)

Reorder Levels

Goal	forcats function
Set order manually	`fct_relevel(f, 'b', 'a','c')`
Set order based on another vector	`fct_reorder(f, x)`
Set order based on which category is most frequent	`fct_infreq(f)`
Set order based on when they first appear	`fct_inorder(f)`
Reverse factor order	`fct_rev(f)`
Rotate order left or right	`fct_shift(f, n)`

`fct_relevel()`

Relevel to the top

f <- factor(letters[1:4])
levels(f)
## [1] "a" "b" "c" "d"
fct_relevel(f, "b", "c")
## [1] a b c d
## Levels: b c a d

Relevel to the end

fct_relevel(f, "a", after = Inf)
## [1] a b c d
## Levels: b c d a

Relevel with a function

fct_relevel(f, sort)
## [1] a b c d
## Levels: a b c d
fct_relevel(f, rev)
## [1] a b c d
## Levels: d c b a

fct_inorder() Reorder by first appearance

f <- factor(c("b", "b", "a", "c", "c", "c"))
fct_inorder(f)
## [1] b b a c c c
## Levels: b a c

fct_infreq() Reorder by frequency

fct_infreq(f)
## [1] b b a c c c
## Levels: c b a

fct_inseq() Reorder by numeric order

f <- factor(1:3, levels = 3:1)
fct_inseq(f)
## [1] 1 2 3
## Levels: 1 2 3

`fct_reorder`

ggplot(iris, aes(fct_reorder(Species, Sepal.Width), Sepal.Width)) +
  geom_boxplot()

`fct_reorder2`

fct_reorder2(factor,x,y) reorders the factor by the y values associated with the largest x values.
This makes the plot easier to read because the line colours line up with the legend.
Notice that the legend order matches the order of the lines at the right-hand edge of the plot (the largest `age` values).

# Share of each marital status within every age: count the (age, marital)
# pairs, then divide each count by the total for that age.
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(prop = n / sum(n)) %>%
  # Drop the grouping so later operations on by_age are not silently per-age.
  ungroup()

# Colour the lines by marital status, ordering the legend with fct_reorder2()
# so it follows the prop values at the largest ages.
ggplot(by_age, aes(age, prop, colour = fct_reorder2(marital, age, prop))) +
  geom_line() +
  labs(colour = "marital")

`fct_rev()`

f <- factor(letters[1:4],levels=letters[c(3:4,1:2)])
levels(f)
## [1] "c" "d" "a" "b"
f%>%fct_rev()
## [1] a b c d
## Levels: b a d c

`fct_shift()`

# Weekday labels for the seven days starting 2024-01-01, with Monday as the
# first level.
x <- seq(ymd("2024/1/1"), ymd("2024/1/7"), by = "1 day") %>%
  wday(label = TRUE, week_start = 1)
x
## [1] Mon Tue Wed Thu Fri Sat Sun
## Levels: Mon < Tue < Wed < Thu < Fri < Sat < Sun
# A negative shift rotates the levels to the right: the last level (Sun)
# moves to the front while the values themselves are unchanged.
fct_shift(x, n = -1)
## [1] Mon Tue Wed Thu Fri Sat Sun
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat

Edit Factor Labels

Goal	`forcats` function
Manually change the label(s)	`fct_recode(f, new_label = "old_label")`
Systematically change all labels	`fct_relabel(f, function)`

`fct_recode`

x <- factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit = "apple", fruit = "banana")
## [1] fruit bear  fruit dear 
## Levels: fruit bear dear

`fct_relabel`

iris$Species%>%fct_relabel(stringr::str_to_upper)%>%table()
## .
##     SETOSA VERSICOLOR  VIRGINICA 
##         50         50         50

Collapse or lump Levels

`fct_collapse`

# Weekday labels for the seven days starting 2024-01-01, with Monday as the
# first level.
x <- seq(ymd("2024/1/1"), ymd("2024/1/7"), by = "1 day") %>%
  wday(label = TRUE, week_start = 1)
x
## [1] Mon Tue Wed Thu Fri Sat Sun
## Levels: Mon < Tue < Wed < Thu < Fri < Sat < Sun
# Merge the first five levels into "weekdays" and the last two into
# "weekends".
lvl <- levels(x)
fct_collapse(x, weekdays = head(lvl, 5), weekends = tail(lvl, 2))
## [1] weekdays weekdays weekdays weekdays weekdays weekends weekends
## Levels: weekdays < weekends

`fct_lump`

fct_lump_min(): Lumps levels that appear fewer than min times.

x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>%
  fct_lump_min(5) %>%
  table()
## .
##     A     B     C     D Other 
##    40    10     5    27     5

fct_lump_prop(): Lumps levels that appear at most prop * n times, where n is the total number of observations.

x %>%
  fct_lump_prop(0.10) %>%
  table()
## .
##     A     B     D Other 
##    40    10    27    10

fct_lump_n(): Lumps all levels except for the n most frequent (or least frequent if n < 0).

x %>%
  fct_lump_n(3) %>%
  table()
## .
##     A     B     D Other 
##    40    10    27    10

fct_lump_lowfreq(): Lumps together the least frequent levels, ensuring that “other” is still the smallest.

x %>%
  fct_lump_lowfreq() %>%
  table()
## .
##     A     D Other 
##    40    27    20

`fct_other()`

fct_other(): Manually replace levels with “other”

# keep: retain only the listed levels; every other level becomes "Other".
x %>%
  fct_other(keep = c("A", "B")) %>%
  table()
## .
##     A     B Other 
##    40    10    37
# drop: replace only the listed levels with "Other"; the rest stay as they are.
x %>%
  fct_other(drop = c("A", "B")) %>%
  table()
## .
##     C     D     E     F     G     H     I Other 
##     5    27     1     1     1     1     1    50

Add or Subtract Levels

`fct_expand`

f <- factor(letters[1:3])
fct_expand(f, "d", "e", "f")
## [1] a b c
## Levels: a b c d e f

`fct_drop`

f <- factor(c("a", "b"), levels = c("a", "b", "c"))
f%>%fct_drop()
## [1] a b
## Levels: a b

在本页

副教授

我的研究聚焦在机器学习和统计学的交叉领域，专注于分析高维数据的方法和算法。