
鏈接:
- 代碼:https://github.com/Z3tt/TidyTuesday/blob/master/R/2020_16_BestRapArtists.Rmd
- 數(shù)據(jù):https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-04-14
此圖講解
加載包
rm(list = ls())
library(tidyverse)
library(patchwork)
library(ggtext)
library(showtext)
# install.packages("nord")
library(nord)
全局主題設(shè)置
theme_set(theme_minimal(base_family = "Arial"))
theme_update(axis.text.x = element_text(size = 11, color = "grey20"),
axis.text.y = element_text(size = 13, color = "black", face = "bold"),
axis.ticks.x = element_line(color = "grey45"),
axis.ticks.y = element_blank(),
axis.ticks.length.x = unit(.4, "lines"),
panel.grid = element_blank(),
plot.background = element_rect(fill = "grey60", color = "grey60"))
數(shù)據(jù)清洗部分函數(shù)
涉及相關(guān)函數(shù)
- str_split()
- unnest()
- mutate() 與 if_else、ifelse
- fct_relevel
str_split():
按照指定字符分割數(shù)據(jù),至于這里為什么要加這么多分割字符,這就是數(shù)據(jù)的特殊性了,我們自己的數(shù)據(jù)要分割什么肯定清楚。
> str_split(c("Grandmaster Flash & The Furious Five"), " & | ft. | feat. | feat | and ")
[[1]]
[1] "Grandmaster Flash" "The Furious Five"
> str_split(c("Kanye West ft. Rihanna & Kid Cudi"), " & | ft. | feat. | feat | and ")
[[1]]
[1] "Kanye West" "Rihanna" "Kid Cudi"
unnest():
> df_ranks %>%
mutate(
# str_split 對(duì)數(shù)據(jù)進(jìn)行分割
artists = str_split(artist, " & | ft. | feat. | feat | and ")
) %>%
select(ID, artists) %>%
slice(5)
# A tibble: 1 x 2
ID artists
<dbl> <list>
1 5 <chr [2]>
Rsudio 中查看是這樣的

Rstudio 中顯示是這樣
使用 unnest() 后:
df_ranks %>%
mutate(
# str_split 對(duì)數(shù)據(jù)進(jìn)行分割
artists = str_split(artist, " & | ft. | feat. | feat | and ")
) %>%
select(ID, artists) %>%
slice(5) %>%
unnest(artists)
# A tibble: 2 x 2
ID artists
<dbl> <chr>
1 5 Dr Dre
2 5 Snoop Doggy Dogg
分組函數(shù):mutate() 與 if_else()(同 ifelse)
當(dāng)我們分組在兩組時(shí)候,用 if_else 或者 ifelse 比較方便,當(dāng)超過兩個(gè)時(shí)候選擇用 mutate() 更加方便。
mutate(
era = case_when(
year >= 1973 & year < 1985 ~ "Old-school DJ Era",
year >= 1985 & year < 1997 ~ "Golden Age",
year >= 1997 & year < 2009 ~ "Bling-Bling Era",
year >= 2009 ~ "Internet Era",
TRUE ~ "other"
),
artists = if_else(artists == "Snoop Doggy Dogg", "Snoop Dogg", artists),
artists = if_else(artists == "JAY-Z", "Jay-Z", artists),
artists = if_else(artists == "Outkast", "OutKast", artists)
)
n_distinct:
uniq ID 數(shù)目,在 tidyverse 包中常與 group_by() 連用
> x <- sample(1:10, 1e5, rep = TRUE)
> length(unique(x))
[1] 10
> n_distinct(x)
[1] 10
fct_relevel:
重新定義因子水平,更多內(nèi)容可以參考 《R for data science》這本書 forcats 章節(jié)
> f <- factor(c("a", "b", "c", "d"), levels = c("b", "c", "d", "a"))
> fct_relevel(f)
[1] a b c d
Levels: b c d a
將 a 優(yōu)先級(jí)放置最前面
> fct_relevel(f, "a")
[1] a b c d
Levels: a b c d
將 b 和 a 優(yōu)先級(jí)放置最前面
> fct_relevel(f, "b", "a")
[1] a b c d
Levels: b a c d
因子排序
> fct_relevel(f, sort)
[1] a b c d
Levels: a b c d
逆向因子
> fct_relevel(f, rev)
[1] a b c d
Levels: a d c b
更多內(nèi)容直接在 R 中 ?fct_relevel() 查看就好,
本例中是按照 sum_points 像升序,然后按照 best 進(jìn)行降序,最后再取 unique 的 artists 類型作為因子:
fct_relevel(factor(artists,
levels = unique( artists[order(sum_points, -best)]) )
)
# 數(shù)據(jù)鏈接:https://github.com/rfordatascience/tidytuesday/tree/master/data/2020/2020-04-14/rankings.csv
df_ranks <- readr::read_csv('2020-04-14rankings.csv')
df_ranks_era <-
df_ranks %>%
mutate(
# str_split 對(duì)數(shù)據(jù)進(jìn)行分割
artists = str_split(artist, " & | ft. | feat. | feat | and ")
) %>%
unnest(artists) %>%
mutate(
era = case_when(
year >= 1973 & year < 1985 ~ "Old-school DJ Era",
year >= 1985 & year < 1997 ~ "Golden Age",
year >= 1997 & year < 2009 ~ "Bling-Bling Era",
year >= 2009 ~ "Internet Era",
TRUE ~ "other"
),
artists = if_else(artists == "Snoop Doggy Dogg", "Snoop Dogg", artists),
artists = if_else(artists == "JAY-Z", "Jay-Z", artists),
artists = if_else(artists == "Outkast", "OutKast", artists)
) %>%
group_by(artists) %>%
mutate(
n_songs = n_distinct(ID),
sum_points = sum(points),
ID = as.numeric(ID),
best = min(ID)
) %>%
filter(
best <= 75,
!artists %in% c("Dido", "Rihanna")
) %>%
ungroup() %>%
arrange(ID) %>%
mutate(
artists = fct_relevel(factor(artists, levels = unique(artists[order(sum_points, -best)]))),
ID = as.numeric(as.factor(ID))
) %>%
arrange(artists)
繪圖部分函數(shù)
- geom_col()
- geom_curve()
- annotate("text")
- coord_flip()
- nord::scale_fill_nord()
- scale_x_discrete() 與 scale_y_continuous()
- theme()
- scale_size()
- patchwork::plot_layout()
nord
一個(gè)配色包,本文用的到配色主題
https://cran.r-project.org/web/packages/nord/readme/README.html

修改 xy 軸 label 位置
這里由于之前進(jìn)行了 coord_flip() ,將 xy 軸互換了。
scale_x_discrete(position = "top") + # label 至于最上,這里相當(dāng)于將互換后 y 軸放置到最右邊
scale_y_continuous(expand = c(.02, .02),
limits = c(-200, 0),
breaks = seq(-175, 0, by = 25),
labels = rev(c(seq(0, 150, by = 25), "175 points")),
position = "right" # 轉(zhuǎn)置后,x 放置于最上面
)
theme
theme 中有一個(gè)參數(shù) plot.margin 是用來控制與邊的距離,之前提到過,
plot.margin:控制上下左右邊距(上,左,下,右),拼圖要善用此函數(shù)
theme(axis.text.y.right = element_text(hjust = .5), # 表示 y 軸 label 劇中對(duì)齊
plot.margin = margin(5, 0, 5, 5))
axis.ticks.x = element_blank() 去除 x 軸刻度尺
axis.ticks.y = element_blank() 去除 y 軸刻度尺
scale_size
重新標(biāo)準(zhǔn)化 ggplot2 中 size 映射的大小
scale_size(range = c(2, 5.5), guide = F)
patchwork
拼圖神器,這里用函數(shù) plot_layout 函數(shù)中的參數(shù) widths 來控制兩個(gè)圖的寬度比。
bars + dots + plot_layout(widths = c(1, .35))
左側(cè)的 barplot
cols <- c("grey60", "#ffc205", "#cecece", "#4e8863")
bars <-
df_ranks_era %>%
ggplot(aes(artists, -points)) +
geom_col(aes(fill = ID),
color = "white",
size = .5,
width = 1.02) +
geom_curve(aes(x = 51.2, xend = 47,
y = -148, yend = -166),
curvature = -.4) +
annotate("text", x = 47, y = -185,
label = "Each rectangle represents\none song included in the\nBBC ranking, its length\n the total points and the\ncolor indicates the rank",
family = "Arial",
size = 3.8,
lineheight = .9) +
annotate("text", x = 21.5, y = -120,
label = 'The Top Artists featured in the BBC′s\n"Greatest Hip-Hop Songs of All Time"',
family = "Arial",
fontface = "bold",
size = 12,
lineheight = .9) +
annotate("text", x = 17, y = -120,
label = 'In Autumn 2019, 108 hip-hop and music experts ranked their 5 favorites out of\n311 nominated songs in an online survey by the BBC. The graphic shows points\nscored in total and per song for the top ranked artists and broken down by era.',
family = "Arial",
fontface = "bold",
color = "grey30",
size = 5.5,
lineheight = .9) +
coord_flip() +
scale_x_discrete(position = "top") +
scale_y_continuous(expand = c(.02, .02),
limits = c(-200, 0),
breaks = seq(-175, 0, by = 25),
labels = rev(c(seq(0, 150, by = 25), "175 points")),
position = "right") +
nord::scale_fill_nord(palette = "halifax_harbor",
discrete = F,
reverse = F,
guide = F) +
theme(axis.text.y.right = element_text(hjust = .5),
plot.margin = margin(5, 0, 5, 5)) +
labs(x = NULL, y = NULL)

右側(cè)的氣泡圖
dots <-
df_ranks_era %>%
group_by(artists, era) %>%
summarize(
n_songs = n_distinct(ID),
best = min(ID)
) %>%
ungroup() %>%
mutate(
era = factor(era, levels = c("Old-school DJ Era", "Golden Age", "Bling-Bling Era", "Internet Era")),
era_num = as.numeric(era)
) %>%
ggplot(aes(artists, era_num, group = artists)) +
geom_point(aes(artists, 1), color = "grey75", size = 2) +
geom_point(aes(artists, 2), color = "grey75", size = 2) +
geom_point(aes(artists, 3), color = "grey75", size = 2) +
geom_point(aes(artists, 4), color = "grey75", size = 2) +
geom_segment(aes(x = artists, xend = artists,
y = 1, yend = 4),
color = "grey75",
size = .3) +
geom_line(color = "black",
size = .9) +
geom_point(aes(fill = best, size = n_songs),
shape = 21,
color = "black",
stroke = 1.2) +
geom_curve(aes(x = 47, xend = 51,
y = 6.1, yend = 4.3),
curvature = .4) +
annotate("text", x = 45.1, y = 6.1,
label = "The dot size indicates\nthe number of songs,\nthe dot color the best\nrank in each era",
family = "Chivo",
size = 3.8,
lineheight = .9) +
coord_flip() +
scale_y_continuous(limits = c(.5, 7.3),
breaks = 1:4,
labels = c("Old-School Era ('73-'84)",
"Golden Age ('85-'96)",
"Bling-Bling Era ('97-'09)",
"Internet Era ('09-'19)"),
position = "right") +
scale_size(range = c(2, 5.5), guide = F) +
nord::scale_fill_nord(palette = "halifax_harbor",
discrete = F,
reverse = F,
guide = F,
limits = c(min(df_ranks_era$ID), max(df_ranks_era$ID))) +
theme(axis.ticks.x = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(size = 11, face = "bold",
hjust = .1, vjust = 0, angle = 20),
axis.text.y = element_blank(),
plot.margin = margin(5, 5, 5, 0),
plot.caption = element_text(face = "bold", color = "grey30",
size = 10, margin = margin(t = 15))) +
labs(x = NULL, y = NULL,
caption = "Visualization by Cédric Scherer ? Data by BBC Music")

patchwork 拼接
bars + dots + plot_layout(widths = c(1, .35))

copy 跑完,實(shí)在太強(qiáng)了,出神入化。