R,筆記04

數據處理

> # 抽取數據並去除重複行
> de_dup <- function(data = iris) {
+   # Return `data` with duplicated rows removed, keeping the first occurrence
+   # of every row. `data` defaults to the built-in iris data set, so the
+   # original zero-argument call de_dup() keeps working.
+   #
+   # A logical mask is used instead of `data[-which(duplicated(data)), ]`:
+   # when nothing is duplicated, which() yields integer(0) and negative
+   # indexing with it selects zero rows instead of all of them.
+   # drop = FALSE keeps a one-column data frame from collapsing to a vector.
+   data[!duplicated(data), , drop = FALSE]
+ }
> head(de_dup())
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
#或者
iris[!duplicated(iris), ]

去掉NA

> head(airquality[complete.cases(airquality), ])
  Ozone Solar.R Wind Temp Month Day
1    41     190  7.4   67     5   1
2    36     118  8.0   72     5   2
3    12     149 12.6   74     5   3
4    18     313 11.5   62     5   4
7    23     299  8.6   65     5   7
8    19      99 13.8   59     5   8
> # 或者na.omit(airquality)

with() identical() within()函數

> # 用with()函數計算鳶尾花花萼與花瓣的長度比
> rwith <- with(iris, Sepal.Length / Petal.Length)
> head(rwith)
[1] 3.642857 3.500000 3.615385 3.066667 3.571429 3.176471
> 
> # identical()基本作用是檢測兩個對象是否完全相同,相同返回TRUE,否則返回FALSE
> 
> # within()函數與with()類似,但主要用於列運算,將運算結果放入新列
> myiris <- iris # 不破壞內建數據集
> myiris <- within(myiris, lenth.ratio <- Sepal.Length / Petal.Length)
> head(myiris)
  Sepal.Length Sepal.Width Petal.Length Petal.Width Species lenth.ratio
1          5.1         3.5          1.4         0.2  setosa    3.642857
2          4.9         3.0          1.4         0.2  setosa    3.500000
3          4.7         3.2          1.3         0.2  setosa    3.615385
4          4.6         3.1          1.5         0.2  setosa    3.066667
5          5.0         3.6          1.4         0.2  setosa    3.571429
6          5.4         3.9          1.7         0.4  setosa    3.176471

分割數據

> # 分割數據
> # cut()將數據的取值範圍切成等寬的區間,處理後的數據是factor(因子)型態
> # 將state.x77對象依人口數做分割,分成5個等寬區間
> popu <- state.x77[, "Population"]
> cutpopu <- cut(popu, 5)
> head(cutpopu)
[1] (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (344,4.53e+03]     (1.7e+04,2.12e+04] (344,4.53e+03]    
Levels: (344,4.53e+03] (4.53e+03,8.7e+03] (8.7e+03,1.29e+04] (1.29e+04,1.7e+04] (1.7e+04,2.12e+04]

> # 分割時為5個區間分別給予名稱"high" "2nd" "3rd" "4th" "low"。注意:labels是按區間由小到大依次對應的,所以這裡"high"實際標記的是人口最少的區間;若要按人口數由多到少命名,應將labels的順序反轉。
> cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
 [1] high high high high low  high high high 2nd  2nd  high high 3rd  2nd  high high high high high high 2nd  3rd  high high 2nd  high high high high 2nd  high low 
[33] 2nd  high 3rd  high high 3rd  high high high high 3rd  high high 2nd  high high 2nd  high
Levels: high 2nd 3rd 4th low
> 
> # 要了解每一個人口數分類中有多少個州
> x.popu <- cut(popu, 5, labels =  c ("high", "2nd", "3rd", "4th", "low"))
> table(x.popu)
x.popu
high  2nd  3rd  4th  low 
  34    9    5    0    2 

合併數據

準備數據框
> mystates.x77 <- as.data.frame(state.x77)
> mystates.x77$name <- rownames(state.x77) # 給新數據增加一個字段name
> head(mystates.x77)
           Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
Alabama          3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
Alaska            365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
Arizona          2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
Arkansas         2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
California      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
Colorado         2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado
> row.names(mystates.x77) <- NULL # 刪除原來行名
> head(mystates.x77)
  Population Income Illiteracy Life Exp Murder HS Grad Frost   Area       name
1       3615   3624        2.1    69.05   15.1    41.3    20  50708    Alabama
2        365   6315        1.5    69.31   11.3    66.7   152 566432     Alaska
3       2212   4530        1.8    70.55    7.8    58.1    15 113417    Arizona
4       2110   3378        1.9    70.66   10.1    39.9    65  51945   Arkansas
5      21198   5114        1.1    71.71   10.3    62.6    20 156361 California
6       2541   4884        0.7    72.06    6.8    63.9   166 103766   Colorado

> # 選出人口大於500萬的州(原單位是千人),同時新對象要有2個字段:name 和 Population
> mypopu.states <- mystates.x77[mystates.x77$Population > 5000, c("name", "Population")]
> mypopu.states
             name Population
5      California      21198
9         Florida       8277
13       Illinois      11197
14        Indiana       5313
21  Massachusetts       5814
22       Michigan       9111
30     New Jersey       7333
32       New York      18076
33 North Carolina       5441
35           Ohio      10735
38   Pennsylvania      11860
43          Texas      12237

> # 選出人均收入大於5000美元的州,同時新對象要有2個字段:name 和 Income
> myincomes.states <- mystates.x77[mystates.x77$Income > 5000, c("name", "Income")]
> myincomes.states
           name Income
2        Alaska   6315
5    California   5114
7   Connecticut   5348
13     Illinois   5107
20     Maryland   5299
28       Nevada   5149
30   New Jersey   5237
34 North Dakota   5087

merge

> # merge()交集合併。merge(x, y, all = FALSE),默認是交集合併
> # 合併上述兩個數據,得到人口超過500萬且人均收入超過5000美元的州
> merge(mypopu.states, myincomes.states)
        name Population Income
1 California      21198   5114
2   Illinois      11197   5107
3 New Jersey       7333   5237
> 
> # 取并集
> merge(mypopu.states, myincomes.states, all = T)
             name Population Income
1          Alaska         NA   6315
2      California      21198   5114
3     Connecticut         NA   5348
4         Florida       8277     NA
5        Illinois      11197   5107
6         Indiana       5313     NA
7        Maryland         NA   5299
8   Massachusetts       5814     NA
9        Michigan       9111     NA
10         Nevada         NA   5149
11     New Jersey       7333   5237
12       New York      18076     NA
13 North Carolina       5441     NA
14   North Dakota         NA   5087
15           Ohio      10735     NA
16   Pennsylvania      11860     NA
17          Texas      12237     NA
> 
> # merge參數all.x = TRUE,保證第一個對象的所有行在合併結果中都存在,第二個對象中沒有對應數據時以NA填充
> merge(mypopu.states, myincomes.states, all.x = T)
             name Population Income
1      California      21198   5114
2         Florida       8277     NA
3        Illinois      11197   5107
4         Indiana       5313     NA
5   Massachusetts       5814     NA
6        Michigan       9111     NA
7      New Jersey       7333   5237
8        New York      18076     NA
9  North Carolina       5441     NA
10           Ohio      10735     NA
11   Pennsylvania      11860     NA
12          Texas      12237     NA

match

> # match()類似於取兩個對象的交集,即第一個對象x的某個元素若在第二個對象y中找到相同的值,則返回它在第二個對象中
> # 相應的位置,否則返回NA。所以match()會返回一個與第一個對象長度相同的向量。
> 
> # 找出人口數多於500萬、同時人均收入超過5000美元的州在對象myincomes.states中的位置,返回向量中的非NA數值即是要的結果。
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index
 [1]  2 NA  4 NA NA NA  7 NA NA NA NA NA
> 
> # 提取出myincomes.states中人口數多於500萬、同時人均收入超過5000美元的州的數據。
> myincomes.states[na.omit(my.index), ]
         name Income
5  California   5114
13   Illinois   5107
30 New Jersey   5237
> 
> # %in%將返回與第一個對象長度相同的邏輯向量,向量中為TRUE的元素對應我們要的數據
> my.index2 <- mypopu.states$name %in% myincomes.states$name
> my.index2
 [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
> 
> # 抽出mypopu.states中人口多於500萬、同時人均收入超過5000美元的州的數據
> mypopu.states[my.index2, ]
         name Population
5  California      21198
13   Illinois      11197
30 New Jersey       7333
> 
> # 換種做法
> my.index <- match(mypopu.states$name, myincomes.states$name)
> my.index3 <- !is.na(my.index) # my.index中不是NA的位置標記為TRUE,賦值給my.index3
> my.index3
 [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
> mypopu.states[my.index3, ]
         name Population
5  California      21198
13   Illinois      11197
30 New Jersey       7333

排序

> # 排序sort/order
> # 數據框的排序,對state.info數據框依據Income字段執行升序排列。
> mystate.info <- data.frame(Region = state.region, state.x77)
> mystate.info
                      Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Alabama                South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska                  West        365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona                 West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas               South       2110   3378        1.9    70.66   10.1    39.9    65  51945
California              West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado                West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Connecticut        Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Delaware               South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida                South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Georgia                South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Hawaii                  West        868   4963        1.9    73.60    6.2    61.9     0   6425
Idaho                   West        813   4119        0.6    71.87    5.3    59.5   126  82677
Illinois       North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Indiana        North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Iowa           North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Kansas         North Central       2280   4669        0.6    72.58    4.5    59.9   114  81787
Kentucky               South       3387   3712        1.6    70.10   10.6    38.5    95  39650
Louisiana              South       3806   3545        2.8    68.76   13.2    42.2    12  44930
Maine              Northeast       1058   3694        0.7    70.39    2.7    54.7   161  30920
Maryland               South       4122   5299        0.9    70.22    8.5    52.3   101   9891
Massachusetts      Northeast       5814   4755        1.1    71.83    3.3    58.5   103   7826
Michigan       North Central       9111   4751        0.9    70.63   11.1    52.8   125  56817
Minnesota      North Central       3921   4675        0.6    72.96    2.3    57.6   160  79289
Mississippi            South       2341   3098        2.4    68.09   12.5    41.0    50  47296
Missouri       North Central       4767   4254        0.8    70.69    9.3    48.8   108  68995
Montana                 West        746   4347        0.6    70.56    5.0    59.2   155 145587
Nebraska       North Central       1544   4508        0.6    72.60    2.9    59.3   139  76483
Nevada                  West        590   5149        0.5    69.03   11.5    65.2   188 109889
New Hampshire      Northeast        812   4281        0.7    71.23    3.3    57.6   174   9027
New Jersey         Northeast       7333   5237        1.1    70.93    5.2    52.5   115   7521
New Mexico              West       1144   3601        2.2    70.32    9.7    55.2   120 121412
New York           Northeast      18076   4903        1.4    70.55   10.9    52.7    82  47831
North Carolina         South       5441   3875        1.8    69.21   11.1    38.5    80  48798
North Dakota   North Central        637   5087        0.8    72.78    1.4    50.3   186  69273
Ohio           North Central      10735   4561        0.8    70.82    7.4    53.2   124  40975
Oklahoma               South       2715   3983        1.1    71.42    6.4    51.6    82  68782
Oregon                  West       2284   4660        0.6    72.13    4.2    60.0    44  96184
Pennsylvania       Northeast      11860   4449        1.0    70.43    6.1    50.2   126  44966
Rhode Island       Northeast        931   4558        1.3    71.90    2.4    46.4   127   1049
South Carolina         South       2816   3635        2.3    67.96   11.6    37.8    65  30225
South Dakota   North Central        681   4167        0.5    72.08    1.7    53.3   172  75955
Tennessee              South       4173   3821        1.7    70.11   11.0    41.8    70  41328
Texas                  South      12237   4188        2.2    70.90   12.2    47.4    35 262134
Utah                    West       1203   4022        0.6    72.90    4.5    67.3   137  82096
Vermont            Northeast        472   3907        0.6    71.64    5.5    57.1   168   9267
Virginia               South       4981   4701        1.4    70.08    9.5    47.8    85  39780
Washington              West       3559   4864        0.6    71.72    4.3    63.5    32  66570
West Virginia          South       1799   3617        1.4    69.48    6.7    41.6   100  24070
Wisconsin      North Central       4589   4468        0.7    72.48    3.0    54.5   149  54464
Wyoming                 West        376   4566        0.6    70.29    6.9    62.9   173  97203
> head(mystate.info)
           Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Alabama     South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Alaska       West        365   6315        1.5    69.31   11.3    66.7   152 566432
Arizona      West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Arkansas    South       2110   3378        1.9    70.66   10.1    39.9    65  51945
California   West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Colorado     West       2541   4884        0.7    72.06    6.8    63.9   166 103766
> state.info <- mystate.info[1:15, ]
> inc.order <- order(state.info$Income) # 默認升序
> state.info[inc.order, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
> 
> # 排序時增加次要鍵值,格式:order(主要鍵值, 次要鍵值, ……)
> # 以state.info數據框為例,將Region作為主要鍵值,Income作為次要鍵值,升序排。
> inc.order2 <- order(state.info$Region, state.info$Income)
> state.info[inc.order2, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
> # 在排序結果中South在Northeast和North Central之間,錯了嗎?沒有錯:state.region是一個因子(可用class()確認)。
> # 對因子而言,order()是按levels的順序排序,而不是按字母順序,所以應該小心。
> 
> # 混合排序。部分字段升序排,部分字段降序排:用xtfrm()可將原向量轉為數值向量,想讓某個字段降序排時,在xtfrm()前加上負號(-)即可
> 
> # 以state.info為例,將Region作為主要鍵值升序排,Income作為次要鍵值降序排。
> mix.order <- order(state.info$Region, -xtfrm(state.info$Income))
> state.info[mix.order, ]
                   Region Population Income Illiteracy Life.Exp Murder HS.Grad Frost   Area
Connecticut     Northeast       3100   5348        1.1    72.48    3.1    56.0   139   4862
Florida             South       8277   4815        1.3    70.66   10.7    52.6    11  54090
Delaware            South        579   4809        0.9    70.06    6.2    54.6   103   1982
Georgia             South       4931   4091        2.0    68.54   13.9    40.6    60  58073
Alabama             South       3615   3624        2.1    69.05   15.1    41.3    20  50708
Arkansas            South       2110   3378        1.9    70.66   10.1    39.9    65  51945
Illinois    North Central      11197   5107        0.9    70.14   10.3    52.6   127  55748
Iowa        North Central       2861   4628        0.5    72.56    2.3    59.0   140  55941
Indiana     North Central       5313   4458        0.7    70.88    7.1    52.9   122  36097
Alaska               West        365   6315        1.5    69.31   11.3    66.7   152 566432
California           West      21198   5114        1.1    71.71   10.3    62.6    20 156361
Hawaii               West        868   4963        1.9    73.60    6.2    61.9     0   6425
Colorado             West       2541   4884        0.7    72.06    6.8    63.9   166 103766
Arizona              West       2212   4530        1.8    70.55    7.8    58.1    15 113417
Idaho                West        813   4119        0.6    71.87    5.3    59.5   126  82677

公式符號等

> # 公式符號,指的是統計學符號,基本的如下
> # y ~ a y是a的函數
> # y ~ a + b y是a和b的函數
> # y ~ a - b y是a的函數但排除b
> 
> # 認識長格式數據(Long Format)與寬格式數據(Wide Format)
> # reshape2擴展包的melt()函數/dcast()函數

©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

  • 我們會(huì)根據(jù)業(yè)務(wù)的要求做各種復(fù)雜的報(bào)表,包括了分組、排序、過濾、轉(zhuǎn)置、差分、填充、移動(dòng)、合并、分裂、分布、去重、找重...
    fhhhfssfhh閱讀 1,644評(píng)論 0 6
  • 背景 一年多以前我在知乎上答了有關(guān)LeetCode的問題, 分享了一些自己做題目的經(jīng)驗(yàn)。 張土汪:刷leetcod...
    土汪閱讀 12,923評(píng)論 0 33
  • 在挖掘分析的過程當(dāng)中對(duì)字符串的處理是極為重要的,且出現(xiàn)也較為頻繁,R語(yǔ)言作為當(dāng)前最為流行的開源數(shù)據(jù)分析和可視化平臺(tái)...
    果果哥哥BBQ閱讀 6,156評(píng)論 0 8
  • 1、 2、寫一個(gè)函數(shù)trim(str),去除字符串兩邊的空白字符 3、 寫一個(gè)函數(shù)isEmail(str),判斷用...
    冰灘波紋閱讀 304評(píng)論 0 0
  • 最近遇到一個(gè)問題,在VSCode的文件側(cè)邊欄右鍵刪除文件的時(shí)候,會(huì)出個(gè)窗口提示,如下圖 經(jīng)測(cè)試,在linux系統(tǒng)中...
    浪費(fèi)了昨天閱讀 21,456評(píng)論 1 3

友情鏈接更多精彩內(nèi)容