數據處理
> # 抽取數據去重複
> # Return `data` with duplicated rows removed (the first occurrence of each
> # row is kept). `data` defaults to the built-in iris data set, so calling
> # de_dup() with no arguments behaves as before.
> de_dup <- function(data = iris)
+ {
+   # Use a logical mask instead of negative indices: with no duplicates,
+   # which(duplicated(data)) is integer(0) and data[-integer(0), ] would
+   # select zero rows rather than all of them.
+   # drop = FALSE keeps a data frame even when `data` has a single column.
+   data[!duplicated(data), , drop = FALSE]
+ }
> head(de_dup())
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
#或者
iris[!duplicated(iris), ]
去掉NA
> head(airquality[complete.cases(airquality), ])
Ozone Solar.R Wind Temp Month Day
1 41 190 7.4 67 5 1
2 36 118 8.0 72 5 2
3 12 149 12.6 74 5 3
4 18 313 11.5 62 5 4
7 23 299 8.6 65 5 7
8 19 99 13.8 59 5 8
> # 或者na.omit(airquality)
with() identical() within()函數
> # 用with()函數計算鳶尾花,花萼與花瓣的長度比
> rwith <- with(iris, Sepal.Length / Petal.Length)
> head(rwith)
[1] 3.642857 3.500000 3.615385 3.066667 3.571429 3.176471
>
> # identical()基本作用是檢測兩個對象是否完全相同,相同返回TRUE,否則返回FALSE
>
> # within函數與with類似,但主要用于列運算,將運算結果放入新列
> myiris <- iris # 不破壞內建數據集
> myiris <- within(myiris, lenth.ratio <- Sepal.Length / Petal.Length)
> head(myiris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species lenth.ratio
1 5.1 3.5 1.4 0.2 setosa 3.642857
2 4.9 3.0 1.4 0.2 setosa 3.500000
3 4.7 3.2 1.3 0.2 setosa 3.615385
4 4.6 3.1 1.5 0.2 setosa 3.066667
5 5.0 3.6 1.4 0.2 setosa 3.571429
6 5.4 3.9 1.7 0.4 setosa 3.176471
分割數據
> # 分割數據
> # cut()將數據依數值範圍切成等寬區間,處理后的數據是factor數據型態
> # 將state.x77對象依人口數做分割,分成5個等寬區間
> popu <- state.x77[, "Population"]
> cutpopu <- cut(popu, 5)
> head(cutpopu)
[1] (344,4.53e+03] (344,4.53e+03] (344,4.53e+03] (344,4.53e+03] (1.7e+04,2.12e+04] (344,4.53e+03]
Levels: (344,4.53e+03] (4.53e+03,8.7e+03] (8.7e+03,1.29e+04] (1.29e+04,1.7e+04] (1.7e+04,2.12e+04]
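> #分割時,分別給各區間名稱"high" "2nd" "3rd" "4th" "low"
> #注意:cut()的labels依區間由小到大對應,所以這裡人口最少的一組被標為"high",人口最多的一組被標為"low";
> #若要讓"high"代表人口最多,應改用labels = c("low", "4th", "3rd", "2nd", "high")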
> cut(popu, 5, labels = c ("high", "2nd", "3rd", "4th", "low"))
[1] high high high high low high high high 2nd 2nd high high 3rd 2nd high high high high high high 2nd 3rd high high 2nd high high high high 2nd high low
[33] 2nd high 3rd high high 3rd high high high high 3rd high high 2nd high high 2nd high
Levels: high 2nd 3rd 4th low
>
> #要了解每一人口數分類有多少州
> x.popu <- cut(popu, 5, labels = c ("high", "2nd", "3rd", "4th", "low"))
> table(x.popu)
x.popu
high 2nd 3rd 4th low
34 9 5 0 2
合并數據
準備數據庫
> mystates.x77 <- as.data.frame(state.x77)
> mystates.x77$name <- rownames(state.x77) # 給新數據增加一個字段name
> head(mystates.x77)
Population Income Illiteracy Life Exp Murder HS Grad Frost Area name
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708 Alabama
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432 Alaska
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417 Arizona
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945 Arkansas
California 21198 5114 1.1 71.71 10.3 62.6 20 156361 California
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766 Colorado
> row.names(mystates.x77) <- NULL # 刪除原來行名
> head(mystates.x77)
Population Income Illiteracy Life Exp Murder HS Grad Frost Area name
1 3615 3624 2.1 69.05 15.1 41.3 20 50708 Alabama
2 365 6315 1.5 69.31 11.3 66.7 152 566432 Alaska
3 2212 4530 1.8 70.55 7.8 58.1 15 113417 Arizona
4 2110 3378 1.9 70.66 10.1 39.9 65 51945 Arkansas
5 21198 5114 1.1 71.71 10.3 62.6 20 156361 California
6 2541 4884 0.7 72.06 6.8 63.9 166 103766 Colorado
> #人口大于500萬的選出來(原單位是千人數),同時新對象要有2個字段name 和 Population
> mypopu.states <- mystates.x77[mystates.x77$Population > 5000, c("name", "Population")]
> mypopu.states
name Population
5 California 21198
9 Florida 8277
13 Illinois 11197
14 Indiana 5313
21 Massachusetts 5814
22 Michigan 9111
30 New Jersey 7333
32 New York 18076
33 North Carolina 5441
35 Ohio 10735
38 Pennsylvania 11860
43 Texas 12237
> #選出人均收入大于5000美元的。同時新對象要有2個字段name 和 Income
> myincomes.states <- mystates.x77[mystates.x77$Income > 5000, c("name", "Income")]
> myincomes.states
name Income
2 Alaska 6315
5 California 5114
7 Connecticut 5348
13 Illinois 5107
20 Maryland 5299
28 Nevada 5149
30 New Jersey 5237
34 North Dakota 5087
merge
> # merge()交集合并。merge(x, y, all = F),默認是交集合并
> # 合并上述兩個數據中人數超500萬的州和人均收入超5000美元的州
> merge(mypopu.states, myincomes.states)
name Population Income
1 California 21198 5114
2 Illinois 11197 5107
3 New Jersey 7333 5237
>
> # 取并集
> merge(mypopu.states, myincomes.states, all = T)
name Population Income
1 Alaska NA 6315
2 California 21198 5114
3 Connecticut NA 5348
4 Florida 8277 NA
5 Illinois 11197 5107
6 Indiana 5313 NA
7 Maryland NA 5299
8 Massachusetts 5814 NA
9 Michigan 9111 NA
10 Nevada NA 5149
11 New Jersey 7333 5237
12 New York 18076 NA
13 North Carolina 5441 NA
14 North Dakota NA 5087
15 Ohio 10735 NA
16 Pennsylvania 11860 NA
17 Texas 12237 NA
>
> # merge參數all.x = T, 保證第一個對象的元素在合并中都存在,第二個如沒有則NA填充
> merge(mypopu.states, myincomes.states, all.x = T)
name Population Income
1 California 21198 5114
2 Florida 8277 NA
3 Illinois 11197 5107
4 Indiana 5313 NA
5 Massachusetts 5814 NA
6 Michigan 9111 NA
7 New Jersey 7333 5237
8 New York 18076 NA
9 North Carolina 5441 NA
10 Ohio 10735 NA
11 Pennsylvania 11860 NA
12 Texas 12237 NA
match
> # match()類似于取兩個對象交集,即第一對象x的某行數據若在第二個對象y中找到符合條件的數據,則返回第二個對象中
> # 相應數據的位置,否則返回NA。所以match后會返回一個與第一個對象長度相同的向量。
>
> # 找出符合人口數多于500萬,同時人均收入超5000美元的行數據,在對象myincomes.states中的位置,返回的向量數值即是要的結果。
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index
[1] 2 NA 4 NA NA NA 7 NA NA NA NA NA
>
> # 提取出myincomes.states中人口數多于500萬,同時人均收入超5000美元的州的數據。
> myincomes.states[na.omit(my.index), ]
name Income
5 California 5114
13 Illinois 5107
30 New Jersey 5237
>
> # %in%將返回與第一個對象長度相同的邏輯向量,在向量中為TRUE的元素是我們要的數據
> my.index2 <- mypopu.states$name %in% myincomes.states$name
> my.index2
[1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
>
> #抽出mypopu.states中人口多于500萬,同時人均收入過5000美元的州數據
> mypopu.states[my.index2, ]
name Population
5 California 21198
13 Illinois 11197
30 New Jersey 7333
>
> # 換種做法
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index3 <- !is.na(my.index) #my.index中不是NA的賦值給my.index3
> my.index3
[1] TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
> mypopu.states[my.index3, ]
name Population
5 California 21198
13 Illinois 11197
30 New Jersey 7333
排序
> # 排序sort/order
> # 數據框的排序,對state.info數據框依據Income字段執行升序排列。
> mystate.info <- data.frame(Region = state.region, state.x77)
> mystate.info
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Kansas North Central 2280 4669 0.6 72.58 4.5 59.9 114 81787
Kentucky South 3387 3712 1.6 70.10 10.6 38.5 95 39650
Louisiana South 3806 3545 2.8 68.76 13.2 42.2 12 44930
Maine Northeast 1058 3694 0.7 70.39 2.7 54.7 161 30920
Maryland South 4122 5299 0.9 70.22 8.5 52.3 101 9891
Massachusetts Northeast 5814 4755 1.1 71.83 3.3 58.5 103 7826
Michigan North Central 9111 4751 0.9 70.63 11.1 52.8 125 56817
Minnesota North Central 3921 4675 0.6 72.96 2.3 57.6 160 79289
Mississippi South 2341 3098 2.4 68.09 12.5 41.0 50 47296
Missouri North Central 4767 4254 0.8 70.69 9.3 48.8 108 68995
Montana West 746 4347 0.6 70.56 5.0 59.2 155 145587
Nebraska North Central 1544 4508 0.6 72.60 2.9 59.3 139 76483
Nevada West 590 5149 0.5 69.03 11.5 65.2 188 109889
New Hampshire Northeast 812 4281 0.7 71.23 3.3 57.6 174 9027
New Jersey Northeast 7333 5237 1.1 70.93 5.2 52.5 115 7521
New Mexico West 1144 3601 2.2 70.32 9.7 55.2 120 121412
New York Northeast 18076 4903 1.4 70.55 10.9 52.7 82 47831
North Carolina South 5441 3875 1.8 69.21 11.1 38.5 80 48798
North Dakota North Central 637 5087 0.8 72.78 1.4 50.3 186 69273
Ohio North Central 10735 4561 0.8 70.82 7.4 53.2 124 40975
Oklahoma South 2715 3983 1.1 71.42 6.4 51.6 82 68782
Oregon West 2284 4660 0.6 72.13 4.2 60.0 44 96184
Pennsylvania Northeast 11860 4449 1.0 70.43 6.1 50.2 126 44966
Rhode Island Northeast 931 4558 1.3 71.90 2.4 46.4 127 1049
South Carolina South 2816 3635 2.3 67.96 11.6 37.8 65 30225
South Dakota North Central 681 4167 0.5 72.08 1.7 53.3 172 75955
Tennessee South 4173 3821 1.7 70.11 11.0 41.8 70 41328
Texas South 12237 4188 2.2 70.90 12.2 47.4 35 262134
Utah West 1203 4022 0.6 72.90 4.5 67.3 137 82096
Vermont Northeast 472 3907 0.6 71.64 5.5 57.1 168 9267
Virginia South 4981 4701 1.4 70.08 9.5 47.8 85 39780
Washington West 3559 4864 0.6 71.72 4.3 63.5 32 66570
West Virginia South 1799 3617 1.4 69.48 6.7 41.6 100 24070
Wisconsin North Central 4589 4468 0.7 72.48 3.0 54.5 149 54464
Wyoming West 376 4566 0.6 70.29 6.9 62.9 173 97203
> head(mystate.info)
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
> state.info <- mystate.info[1:15, ]
> inc.order <- order(state.info$Income) # 默認升序
> state.info[inc.order, ]
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
>
> # 排序時增加次要鍵值,格式,order(主要鍵值,次要鍵值,……)
> # 以state.info 數據框為例,將Region作為主要鍵值,Income作為次要鍵值,升序排。
> inc.order2 <- order(state.info$Region, state.info$Income)
> state.info[inc.order2, ]
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
> # 在排序結果中south在northeast和north central之間,錯了嗎?這是由于state.region是一個因子,class()可知。
> # 對因子而言order的排序,相當于是執行levels排序,所以應該小心。
>
> # 混合排序。部分字段升序排,部分字段降序排,用xtfrm(),可將原向量轉為數值向量,當想要以不同方式排序時,在xtfrm()前加上-即可
>
> #以state.info為例,將Region作為主要鍵值升序排,Income作次要鍵值降序排。
> mix.order <- order(state.info$Region, -xtfrm(state.info$Income))
> state.info[mix.order, ]
Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost Area
Connecticut Northeast 3100 5348 1.1 72.48 3.1 56.0 139 4862
Florida South 8277 4815 1.3 70.66 10.7 52.6 11 54090
Delaware South 579 4809 0.9 70.06 6.2 54.6 103 1982
Georgia South 4931 4091 2.0 68.54 13.9 40.6 60 58073
Alabama South 3615 3624 2.1 69.05 15.1 41.3 20 50708
Arkansas South 2110 3378 1.9 70.66 10.1 39.9 65 51945
Illinois North Central 11197 5107 0.9 70.14 10.3 52.6 127 55748
Iowa North Central 2861 4628 0.5 72.56 2.3 59.0 140 55941
Indiana North Central 5313 4458 0.7 70.88 7.1 52.9 122 36097
Alaska West 365 6315 1.5 69.31 11.3 66.7 152 566432
California West 21198 5114 1.1 71.71 10.3 62.6 20 156361
Hawaii West 868 4963 1.9 73.60 6.2 61.9 0 6425
Colorado West 2541 4884 0.7 72.06 6.8 63.9 166 103766
Arizona West 2212 4530 1.8 70.55 7.8 58.1 15 113417
Idaho West 813 4119 0.6 71.87 5.3 59.5 126 82677
公式符號等
> # 公式符號,指的是統計學符號,基本的如下
> # y ~ a y是a的函數
> # y ~ a + b y是a和b的函數
> # y ~ a - b y是a的函數但排除b
>
> # 認識長格式數據(Long Format)與寬格式數據(Wide Format)
> # reshape2擴展包的melt()函數/dcast()函數