Ruizheng 的學習筆記

感謝生信技能樹小潔老師

stringr

rm(list = ls())
if(!require(stringr))install.packages('stringr')

## Loading required package: stringr

library(stringr)

x <- "The birch canoe slid on the smooth planks."

1.檢測字符串長度

length(x)

## [1] 1

str_length(x)

## [1] 42

2.字符串拆分與組合

str_split(x," ")

## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."

x2 = str_split(x," ")[[1]]
str_c(x2,collapse = " ")

## [1] "The birch canoe slid on the smooth planks."

str_c(x2,1234,sep = "+")

## [1] "The+1234"     "birch+1234"   "canoe+1234"   "slid+1234"    "on+1234"     
## [6] "the+1234"     "smooth+1234"  "planks.+1234"

3.提取字符串的一部分

str_sub(x,5,9)

## [1] "birch"

4.大小寫轉換

str_to_upper(x2)

## [1] "THE"     "BIRCH"   "CANOE"   "SLID"    "ON"      "THE"     "SMOOTH" 
## [8] "PLANKS."

str_to_lower(x2)

## [1] "the"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."

str_to_title(x2)

## [1] "The"     "Birch"   "Canoe"   "Slid"    "On"      "The"     "Smooth" 
## [8] "Planks."

5.字符串排序

str_sort(x2)

## [1] "birch"   "canoe"   "on"      "planks." "slid"    "smooth"  "the"    
## [8] "The"

6.字符檢測

str_detect(x2,"h")

## [1]  TRUE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE

str_starts(x2,"T")

## [1]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

str_ends(x2,"e")

## [1]  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE

與sum和mean連用，可以統計匹配的個數和比例

sum(str_detect(x2,"h"))

## [1] 4

mean(str_detect(x2,"h"))

## [1] 0.5

7.提取匹配到的字符串

str_subset(x2,"h")

## [1] "The"    "birch"  "the"    "smooth"

8.字符計數

str_count(x," ")

## [1] 7

str_count(x2,"o")

## [1] 0 0 1 0 1 0 2 0

9.字符串替換

str_replace(x2,"o","A")

## [1] "The"     "birch"   "canAe"   "slid"    "An"      "the"     "smAoth" 
## [8] "planks."

str_replace_all(x2,"o","A")

## [1] "The"     "birch"   "canAe"   "slid"    "An"      "the"     "smAAth" 
## [8] "planks."

結合正則表達式更加強大

練習6-2

# Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community.
# 1. Store the sentence above as one long string in `tmp`.
tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."

# 2. Split the sentence into a character vector of words, `tmp2`
#    (mind the punctuation).
# The *_all variants are used so that every comma and every period is
# handled, not only the first occurrence of each.
tmp2 <- tmp %>%
  str_replace_all(",", " ") %>%  # the comma separates two words here
  str_remove_all("[.]") %>%      # drop the sentence-ending period
  str_split(" ")
tmp2 <- tmp2[[1]]  # str_split() returns a list; keep its only element

# 3. Use a function to return how many words the sentence contains.
length(tmp2)

## [1] 16

# 4. Use a function to return how many letters each word is made of.
# str_count() is called without a pattern here; the output below equals the
# number of characters in each word.
a <- str_count(tmp2);a

##  [1] 14  2  1  3  7  2  7  4 10  8  3 13  2  3  8  9

# 5. Count how many words in tmp2 contain the letter "e".
sum(str_detect(tmp2,"e"))

## [1] 7

str_detect(x, "h") - 返回等長的邏輯值向量

條件語句

if條件語句：如果。。。就。。。，否則。。。

if(一個邏輯值){ 一段代碼 } else { 一段代碼 }

(1)只有if沒有else，那么條件是FALSE時就什么都不做

i = -1
if (i<0) print('up')

## [1] "up"

if (i>0) print('up')

if(!require(tidyr)) install.packages("tidyr")

## Loading required package: tidyr

(2)有else

i <- 1
# Two-way branch: the non-positive case is tested first, the positive case
# falls through to else.
if (i <= 0) {
  print("-")  # print() decorates the value (index prefix and quotes)
} else {
  cat("+")    # cat() writes the content as-is
}

## +

ifelse 很重要

# Draw 10 random normal values and label each one by its sign in a single
# vectorised ifelse() call.
x <- rnorm(10)
y <- ifelse(x > 0, "+", "-")
print(y)

##  [1] "+" "-" "-" "-" "-" "-" "-" "+" "+" "-"

(3)多個條件

i <- 0
# The three conditions are mutually exclusive, so exactly one branch prints.
if (i == 0) {
  print("0")
} else if (i > 0) {
  print("+")
} else if (i < 0) {
  print("-")
}

## [1] "0"

ifelse(i>0,"+",ifelse((i<0),"-","0"))

## [1] "0"

2.switch()

# switch() with a numeric EXPR picks the alternative by position, so
# cd = 3 returns the third alternative (`cc`).
cd <- 3
foo <- switch(EXPR = cd,
              # EXPR = "aa",
              aa = c(3.4, 1),
              bb = matrix(1:4, 2, 2),
              # TRUE/FALSE rather than T/F: T and F are ordinary variables
              # that can be reassigned.
              cc = matrix(c(TRUE, TRUE, FALSE, TRUE, FALSE, FALSE), 3, 2),
              dd = "string here",
              ee = matrix(c("red", "green", "blue", "yellow")))
foo

##       [,1]  [,2]
## [1,]  TRUE  TRUE
## [2,]  TRUE FALSE
## [3,] FALSE FALSE

dplyr::case_when() 解決無限套娃問題

1. For循環

循環中中括號建議寫兩個

順便看一下next和break

x <- c(5, 6, 0, 3)
s <- 0
# Iterate over the values of x themselves; s accumulates the running total.
for (i in x) {
  s <- s + i
  # if (i == 0) next    # would skip to the next iteration
  # if (i == 0) break   # would leave the loop entirely
  # Printed per iteration: position of i in x, i, its reciprocal, running total.
  print(c(which(x == i), i, 1 / i, s))
}

## [1] 1.0 5.0 0.2 5.0
## [1]  2.0000000  6.0000000  0.1666667 11.0000000
## [1]   3   0 Inf  11
## [1]  4.0000000  3.0000000  0.3333333 14.0000000

x <- c(5, 6, 0, 3)
s <- 0
# seq_along(x) instead of 1:length(x): for an empty x, 1:length(x) is c(1, 0)
# and the loop body would run on indices that do not exist.
for (i in seq_along(x)) {
  s <- s + x[[i]]  # inside loops, prefer [[ ]] to extract a single element
  # if (i == 3) next
  # if (i == 3) break
  # Printed per iteration: index, element, its reciprocal, running total.
  print(c(i, x[[i]], 1 / x[[i]], s))
}

## [1] 1.0 5.0 0.2 5.0
## [1]  2.0000000  6.0000000  0.1666667 11.0000000
## [1]   3   0 Inf  11
## [1]  4.0000000  3.0000000  0.3333333 14.0000000

如何將結果存下來?

# Store one vector per iteration in a preallocated list, then bind the
# vectors column-wise so that column i holds the values from iteration i.
x <- c(5, 6, 0, 3)  # same input as the loops above; restated so this runs alone
s <- 0
result <- vector("list", length(x))
for (i in seq_along(x)) {
  s <- s + x[[i]]
  # Third entry is the reciprocal of the element, 1 / x[[i]], as in the
  # loops above; 1 / i would be the reciprocal of the index instead.
  result[[i]] <- c(i, x[[i]], 1 / x[[i]], s)
}
do.call(cbind, result)

##      [,1]       [,2] [,3]       [,4]
## [1,]  1.0  2.0000000    3  4.0000000
## [2,]  5.0  6.0000000    0  3.0000000
## [3,]  0.2  0.1666667  Inf  0.3333333
## [4,]  5.0 11.0000000   11 14.0000000

練習6-3

注意最后aes()傳參的問題

為什么for循環和aes八字不合

get()函數可以讓字符變成變量名

# 1. Use a loop to show the data type of "a", TRUE and 3.
m <- list("a", TRUE, 3)
for (i in seq_along(m)) {
  # Values are not auto-printed inside a for loop, so print() is needed
  # for the classes to actually appear.
  print(class(m[[i]]))
}
# 2. Generate 10 random numbers and build a new vector from them:
#    values above the median map to "A", values below it map to "B".
m <- rnorm(10)
m1 <- ifelse(m > median(m), "A", "B")
m1

##  [1] "A" "A" "B" "A" "B" "A" "B" "B" "B" "A"

# 3. From the tmp2 of the previous exercise, build a new vector:
#    words containing "e" map to "A", words without "e" map to "B".
tmp <- "Bioinformatics is a new subject of genetic data collection,analysis and dissemination to the research community."
library(stringr)
# The *_all variants handle every comma and period, not just the first one.
tmp2 <- tmp %>%
  str_replace_all(",", " ") %>%
  str_remove_all("[.]") %>%
  str_split(" ")
tmp2 <- tmp2[[1]]  # str_split() returns a list; keep its only element
ifelse(str_detect(tmp2, "e"), "A", "B")

##  [1] "B" "B" "B" "A" "A" "B" "A" "B" "A" "B" "B" "A" "B" "A" "A" "B"

#2.生成一個隨機數（rnorm）組成的10行6列的矩陣，列名為sample1，sample2….sample6，
# 行名為gene1，gene2…gene10，
# 分組為sample1、2、3屬于A組，sample4、5、6屬于B組。
# 用循環對每個基因畫ggplot2箱線圖。
set.seed(2020)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(ggplot2)
library(cowplot)

## 
## ********************************************************

## Note: As of version 1.0.0, cowplot does not change the

##   default ggplot2 theme anymore. To recover the previous

##   behavior, execute:
##   theme_set(theme_cowplot())

## ********************************************************

library(patchwork)

## 
## Attaching package: 'patchwork'

## The following object is masked from 'package:cowplot':
## 
##     align_plots

exp = matrix(rnorm(60),nrow = 10)
colnames(exp) <- paste0("sample",1:6)
rownames(exp) <- paste0("gene",1:10)
exp[1:4,1:4]

##          sample1    sample2     sample3    sample4
## gene1  0.3769721 -0.8531228  2.17436525 -0.8125047
## gene2  0.3015484  0.9092592  1.09818265 -0.7437022
## gene3 -1.0980232  1.1963730  0.31822032  1.0953451
## gene4 -1.1304059 -0.3715839 -0.07314756  2.4353737

# Transpose so that samples become rows and genes become columns, then attach
# the per-sample annotation columns.
dat <- data.frame(t(exp))
dat <- dat %>%
  mutate(
    group = rep(c("A", "B"), each = 3),
    pair = rep(c("AA", "BB", "cc"), each = 2)
  )

# Long format: one row per sample/gene combination.
# pivot_longer() is the current replacement for the superseded gather().
dat2 <- pivot_longer(
  dat,
  cols = -c(group, pair),
  names_to = "gene",
  values_to = "expression"
)
# One facet per gene, comparing the two groups.
ggplot(data = dat2) +
  geom_boxplot(aes(x = group, y = expression, color = group)) +
  theme_bw() +
  facet_wrap(~gene, nrow = 2)

image.png

# Build one boxplot per gene and combine them into a single layout.
# Every column of `dat` that is not an annotation column is treated as a gene,
# so the number of genes is not hard-coded.
genes <- setdiff(colnames(dat), c("group", "pair"))
p <- lapply(genes, function(gene) {
  # aes() is evaluated lazily, when the plot is drawn. Building each plot
  # inside a function gives it its own `gene`, and .data[[gene]] looks the
  # column up by name in `data` instead of inlining a raw vector.
  ggplot(data = dat, aes(x = group, y = .data[[gene]], color = group)) +
    geom_boxplot() +
    ylab(paste0("Expression of ", gene))
})

wrap_plots(p, nrow = 2, guides = "collect")

image.png

2.while 循環(huán)

i <- 0

# Print i and its square for as long as i stays below 5; i grows by 1 per pass.
while (i < 5) {
  print(c(i, i^2))
  i <- i + 1
}

## [1] 0 0
## [1] 1 1
## [1] 2 4
## [1] 3 9
## [1]  4 16

apply()族函數

1.apply 處理矩陣或數據框

apply(X, MARGIN, FUN, …)

其中X是數據框/矩陣名；

MARGIN為1表示取行，為2表示取列，FUN是函數

test <- iris[, 1:4]

# MARGIN = 2: apply mean() to every column.
apply(test, 2, mean)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333

# MARGIN = 1: apply sum() to every row; show the first three results.
a <- apply(test, 1, sum)
a[1:3]

## [1] 10.2  9.5  9.4

# The same row sums with an explicit loop. The result vector is preallocated
# as numeric, so it is not grown on every iteration and does not end up as a
# list; seq_len() also stays correct for a data frame with zero rows.
res <- numeric(nrow(test))

for (i in seq_len(nrow(test))) {
  res[[i]] <- sum(test[i, ])
}
res[1:3]

## [1] 10.2  9.5  9.4

練習：

# 1. Load test2.Rdata and compute the variance of each row.
# load(file = "test2.Rdata")
# var() returns the variance the exercise asks for; sd() would return its
# square root (the standard deviation).
apply(test, 1, var)[1:3]

## [1] 4.750000 4.149167 3.990000

# 2.加載class.Rdata,嘗試將前6列轉為數值型，得到一個新矩陣
# load(file = "class.Rdata")
# apply(y[,1:6], 2, as.numeric)

# 3. Explain this code:
names(tail(sort(apply(test,1,sd)),1000))[1:10]

## NULL

# Reading inside-out: compute sd() for every row of test, sort the values in
# ascending order, keep the last 1000 (the largest), take their names, and show
# the first 10 of those names.
# NOTE(review): the result printed above is NULL, presumably because the
# row-wise result for this `test` carries no names -- confirm on the real data.

重點函數

sort
match
names
ifelse 和 str_detect
identical
arrange
merge 和 inner_join
unique 和 duplicated

重點知識點

向量、數據框、列表取子集
數據框新增列
文件讀取
Rdata的加載與保存
作圖保存
R包安裝和加載
形式參數、實際參數、默認參數

R語言遍歷、創建、刪除文件夾

dir()
file.create()
file.exists(…)
file.remove()
file.rename(from, to)
file.append(file1, file2)

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

DAY7 生信技能樹-數(shù)據(jù)挖掘第三期學(xué)習(xí)筆記

Ruizheng 的學(xué)習(xí)筆記

感謝 生信技能樹 小潔老師

stringr

1.檢測(cè)字符串長(zhǎng)度

2.字符串拆分與組合

3.提取字符串的一部分

4.大小寫轉(zhuǎn)換

5.字符串排序

6.字符檢測(cè)

與sum和mean連用，可以統(tǒng)計(jì)匹配的個(gè)數(shù)和比例

7.提取匹配到的字符串

8.字符計(jì)數(shù)

9.字符串替換

結(jié)合正則表達(dá)式更加強(qiáng)大

練習(xí)6-2

str_detect(x, “h”) - 返回等長(zhǎng)的邏輯值向量

條件語(yǔ)句

if條件語(yǔ)句：如果。。。就。。。，否則。。。

(1)只有if沒有else，那么條件是FALSE時(shí)就什么都不做

(2)有else

ifelse 很重要

(3)多個(gè)條件

2.switch()

dplyr::case_when() 解決無限套娃問題

1. For循環(huán)

循環(huán)中 中括號(hào)建議寫兩個(gè)

順便看一下next和break

如何將結(jié)果存下來?

練習(xí)6-3

注意最后aes()傳參的問題

get()函數(shù)可以讓字符變成變量名

2.while 循環(huán)

apply()族函數(shù)

1.apply 處理矩陣或數(shù)據(jù)框

apply(X, MARGIN, FUN, …)

其中X是數(shù)據(jù)框/矩陣名；

MARGIN為1表示取行，為2表示取列，F(xiàn)UN是函數(shù)

練習(xí)：

重點(diǎn)函數(shù)

重點(diǎn)知識(shí)點(diǎn)

R語(yǔ)言遍歷、創(chuàng)建、刪除文件夾

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

感謝生信技能樹小潔老師

與sum和mean連用，可以統(tǒng)計(jì)匹配的個(gè)數(shù)和比例

if條件語(yǔ)句：如果。。。就。。。，否則。。。

(1)只有if沒有else，那么條件是FALSE時(shí)就什么都不做

循環(huán)中中括號(hào)建議寫兩個(gè)

MARGIN為1表示取行，為2表示取列，F(xiàn)UN是函數(shù)

R語(yǔ)言遍歷、創(chuàng)建、刪除文件夾