Day6-庚帥博方便面

dplyr包

read.file() Copy from rdocumentation website
Usage

read.file(file=NULL,header=TRUE,use.value.labels=FALSE,to.data.frame=TRUE,sep=",",widths=NULL,f=NULL, filetype=NULL,...)

Copy from dplyr overview

dplyr五個基礎(chǔ)函數(shù)

新增列

> mutate(test, new = Sepal.Length * Sepal.Width)
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species   new
1          5.1         3.5          1.4         0.2     setosa 17.85
2          4.9         3.0          1.4         0.2     setosa 14.70
3          7.0         3.2          4.7         1.4 versicolor 22.40
4          6.4         3.2          4.5         1.5 versicolor 20.48
5          6.3         3.3          6.0         2.5  virginica 20.79
6          5.8         2.7          5.1         1.9  virginica 15.66
> test
    Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1            5.1         3.5          1.4         0.2     setosa
2            4.9         3.0          1.4         0.2     setosa
51           7.0         3.2          4.7         1.4 versicolor
52           6.4         3.2          4.5         1.5 versicolor
101          6.3         3.3          6.0         2.5  virginica
102          5.8         2.7          5.1         1.9  virginica

這里test并沒有加入new這個variable,rowname也不同,只是一個臨時變量
mutate(.data, ...) adds new variables that are functions of existing variable
The arguments in ... are automatically quoted and evaluated in the context of the data frame.

按列篩選

select() picks variables based on their names.
使用colmumn number選擇列

> select(test,1)
    Sepal.Length
1            5.1
2            4.9
51           7.0
52           6.4
101          6.3
102          5.8
> select(test,c(1,5))
    Sepal.Length    Species
1            5.1     setosa
2            4.9     setosa
51           7.0 versicolor
52           6.4 versicolor
101          6.3  virginica
102          5.8  virginica

使用variable name選擇列

> select(test, Petal.Length, Petal.Width)
    Petal.Length Petal.Width
1            1.4         0.2
2            1.4         0.2
51           4.7         1.4
52           4.5         1.5
101          6.0         2.5
102          5.1         1.9
> vars <- c("Petal.Length", "Petal.Width")
> select(test, one_of(vars))
    Petal.Length Petal.Width
1            1.4         0.2
2            1.4         0.2
51           4.7         1.4
52           4.5         1.5
101          6.0         2.5
102          5.1         1.9

one_of(): Matches variable names in a character vector.

Q:one_of()同樣是dplyr中的function;這里使用這種麻煩的argument有什么實(shí)際意義嘛?

篩選行

filter() picks cases based on their values.

> filter(test, Species == "setosa")
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa

filter(.data, ..., .preserve = FALSE)
Use filter() to choose rows/cases where conditions are true. Unlike base subsetting with [, rows where the condition evaluates to NA are dropped.
Logical predicates defined in terms of the variables in .data. Multiple conditions are combined with &. Only rows where the condition evaluates to TRUE are kept.
The arguments in ... are automatically quoted and evaluated in the context of the data frame.

按某1列或某幾列對整個表格進(jìn)行排序

arrange(.data, ...) changes the ordering of the rows.
...: Comma separated list of unquoted variable names, or expressions involving variable names. Use desc() to sort a variable in descending order.

> arrange(test, Sepal.Length)
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          4.9         3.0          1.4         0.2     setosa
2          5.1         3.5          1.4         0.2     setosa
3          5.8         2.7          5.1         1.9  virginica
4          6.3         3.3          6.0         2.5  virginica
5          6.4         3.2          4.5         1.5 versicolor
6          7.0         3.2          4.7         1.4 versicolor
> arrange(test, desc(Sepal.Length))
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          7.0         3.2          4.7         1.4 versicolor
2          6.4         3.2          4.5         1.5 versicolor
3          6.3         3.3          6.0         2.5  virginica
4          5.8         2.7          5.1         1.9  virginica
5          5.1         3.5          1.4         0.2     setosa
6          4.9         3.0          1.4         0.2     setosa
//這里應(yīng)該是先按照Sepal.Length升序排列,相同的按照Sepal.Width降序排列
> arrange(test, Sepal.Length, desc(Sepal.Width))
  Sepal.Length Sepal.Width Petal.Length Petal.Width    Species
1          4.9         3.0          1.4         0.2     setosa
2          5.1         3.5          1.4         0.2     setosa
3          5.8         2.7          5.1         1.9  virginica
4          6.3         3.3          6.0         2.5  virginica
5          6.4         3.2          4.5         1.5 versicolor
6          7.0         3.2          4.7         1.4 versicolor

匯總

summarise() reduces multiple values down to a single summary.

> summarise(test, mean(Sepal.Length), sd(Sepal.Length))# 計(jì)算Sepal.Length的平均值和標(biāo)準(zhǔn)差
  mean(Sepal.Length) sd(Sepal.Length)
1           5.916667        0.8084965
> # 先按照Species分組,計(jì)算每組Sepal.Length的平均值和標(biāo)準(zhǔn)差group_by(test, Species)
> group_by(test, Species)
# A tibble: 6 x 5
# Groups:   Species [3]
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
*        <dbl>       <dbl>        <dbl>       <dbl> <fct>     
1          5.1         3.5          1.4         0.2 setosa    
2          4.9         3            1.4         0.2 setosa    
3          7           3.2          4.7         1.4 versicolor
4          6.4         3.2          4.5         1.5 versicolor
5          6.3         3.3          6           2.5 virginica 
6          5.8         2.7          5.1         1.9 virginica 

group_by(.data, ..., add = FALSE, .drop = group_by_drop_default(.data))
.data a tbl
... Variables to group by. All tbls accept variable names. Some tbls will accept functions of variables. Duplicated groups will be silently dropped.
add When add = FALSE, the default, group_by() will override existing groups. To add to the existing groups, use add = TRUE.
.drop When .drop = TRUE, empty groups are dropped. See group_by_drop_default() for what the default value is for this argument.

dplyr兩個實(shí)用技能

test1
##   x z
## 1 b A
## 2 e B
## 3 f C
## 4 x D
test2 
##   x y
## 1 a 1
## 2 b 2
## 3 c 3
## 4 d 4
## 5 e 5
## 6 f 6

管道操作 %>% (cmd/ctr + shift + M)

> ?%>%
Error: unexpected SPECIAL in "?%>%"

報(bào)錯了~

以下Copy來的文檔 Piping 中的例子

The dplyr API is functional in the sense that function calls don’t have side-effects. You must always save their results. This doesn’t lead to particularly elegant code, especially if you want to do many operations at once. You either have to do it step-by-step:

a1 <- group_by(flights, year, month, day)
a2 <- select(a1, arr_delay, dep_delay)
a3 <- summarise(a2,
  arr = mean(arr_delay, na.rm = TRUE),
  dep = mean(dep_delay, na.rm = TRUE))
a4 <- filter(a3, arr > 30 | dep > 30)

Or if you don’t want to name the intermediate results, you need to wrap the function calls inside each other:

filter(
  summarise(
    select(
      group_by(flights, year, month, day),
      arr_delay, dep_delay
    ),
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ),
  arr > 30 | dep > 30
)
#> Adding missing grouping variables: `year`, `month`, `day`
#> # A tibble: 49 x 5
#> # Groups:   year, month [11]
#>    year month   day   arr   dep
#>   <int> <int> <int> <dbl> <dbl>
#> 1  2013     1    16  34.2  24.6
#> 2  2013     1    31  32.6  28.7
#> 3  2013     2    11  36.3  39.1
#> 4  2013     2    27  31.3  37.8
#> # … with 45 more rows

This is difficult to read because the order of the operations is from inside to out. Thus, the arguments are a long way away from the function. To get around this problem, dplyr provides the %>% operator from magrittr. x %>% f(y) turns into f(x, y) so you can use it to rewrite multiple operations that you can read left-to-right, top-to-bottom:

flights %>%
  group_by(year, month, day) %>%
  select(arr_delay, dep_delay) %>%
  summarise(
    arr = mean(arr_delay, na.rm = TRUE),
    dep = mean(dep_delay, na.rm = TRUE)
  ) %>%
  filter(arr > 30 | dep > 30)

個人感覺還是一步一步來比較不容易出錯~

count統(tǒng)計(jì)某列的unique值

其實(shí)就是把factor枚舉出來 并給出個數(shù)

count(test,Species)
## # A tibble: 3 x 2
##   Species        n
##         
## 1 setosa         2
## 2 versicolor     2
## 3 virginica      2

dplyr處理關(guān)系數(shù)據(jù)

Refer to Join two tbls together
這里把用到的兩大類join的功能copy過來
兩大類分別是:mutating join 和 filtering join
function的原型抽象為xxx_join(x, y, by = NULL, copy = FALSE, ...)
例子來源于生信星球

Mutating joins combine variables from the two data.frames:

  inner_join()
    return all rows from x where there are matching values in y, and all columns from x and y. If there are multiple matches between x and y, all combination of the matches are returned.

  left_join()
    return all rows from x, and all columns from x and y. Rows in x with no match in y will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.

  right_join()
    return all rows from y, and all columns from x and y. Rows in y with no match in x will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned.

  full_join()
    return all rows and all columns from both x and y. Where there are not matching values, returns NA for the one missing.

Filtering joins keep cases from the left-hand data.frame:

  semi_join()
    return all rows from x where there are matching values in y, keeping just columns from x. A semi join differs from an inner join because an inner join will return one row of x for each matching row of y, where a semi join will never duplicate rows of x.

  anti_join()
    return all rows from x where there are not matching values in y, keeping just columns from x.

內(nèi)連inner_join,取交集

inner_join(test1, test2, by = "x")
##   x z y
## 1 b A 2
## 2 e B 5
## 3 f C 6

左連left_join

left_join(test1, test2, by = 'x')
##   x z  y
## 1 b A  2
## 2 e B  5
## 3 f C  6
## 4 x D NA
left_join(test2, test1, by = 'x')
##   x y    z
## 1 a 1 
## 2 b 2    A
## 3 c 3 
## 4 d 4 
## 5 e 5    B
## 6 f 6    C

全連full_join

full_join( test1, test2, by = 'x')
##   x    z  y
## 1 b    A  2
## 2 e    B  5
## 3 f    C  6
## 4 x    D NA
## 5 a   1
## 6 c   3
## 7 d   4

半連接:返回能夠與y表匹配的x表所有記錄semi_join

semi_join(x = test1, y = test2, by = 'x')
##   x z
## 1 b A
## 2 e B
## 3 f C

反連接:返回?zé)o法與y表匹配的x表的所記錄anti_join

anti_join(x = test2, y = test1, by = 'x')
##   x y
## 1 a 1
## 2 c 3
## 3 d 4

簡單合并

test1 <- data.frame(x = c(1,2,3,4), y = c(10,20,30,40))
test1
##   x  y
## 1 1 10
## 2 2 20
## 3 3 30
## 4 4 40
test2 <- data.frame(x = c(5,6), y = c(50,60))
test2
##   x  y
## 1 5 50
## 2 6 60
test3 <- data.frame(z = c(100,200,300,400))
test3
##     z
## 1 100
## 2 200
## 3 300
## 4 400
bind_rows(test1, test2)
##   x  y
## 1 1 10
## 2 2 20
## 3 3 30
## 4 4 40
## 5 5 50
## 6 6 60
bind_cols(test1, test3)
##   x  y   z
## 1 1 10 100
## 2 2 20 200
## 3 3 30 300
## 4 4 40 400

所以既然有自帶cbind() rbind() dplyr包要設(shè)計(jì)這種function出來

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容