2019.08.22
今天學(xué)習(xí)了r語言關(guān)于GEO數(shù)據(jù)挖掘方面的知識點,做一些整理。
1.首先是相應(yīng)數(shù)據(jù)的下載與檢查
利用R包進行下載
rm(list = ls())#清空環(huán)境變量
options(stringsAsFactors = F)##字符不作為因子讀入
#####數(shù)據(jù)下載
library(GEOquery)
f='GSE5282.Rdata'
####getGPL獲得平臺的注釋信息,但下載速度會慢很多
####而且注釋文件格式大多不如bioconductor包好用
if(!file.exists(f)){
gset<-getGEO('GSE5282',destdir='.',
AnnotGPL=F,
getGPL=F)
save(gset,file=f)
}
#數(shù)據(jù)提取
load('GSE5282.Rdata')
ex= exprs(gset[[1]])
pd = pData(gset[[1]])
############代碼查看矩陣是否需要log(來自GEO2R)
qx <- as.numeric(quantile(ex, c(0., 0.25, 0.5, 0.75, 0.99, 1.0), na.rm=T))
LogC <- (qx[5] > 100) ||
(qx[6]-qx[1] > 50 && qx[2] > 0) ||
(qx[2] > 0 && qx[2] < 1 && qx[4] > 1 && qx[4] < 2)
LogC
###一般來說如果LogC返回值為True,那么需要再做一步log2處理,此處為T,故做了處理。
##且為了防止出現(xiàn)負數(shù)的情況,一般會括號里加1.
ex = log2(ex+1)
##########boxplot可視化數(shù)據(jù)檢查
boxplot(ex,las=2,cex.axis=0.6,main='data check')
boxplot(ex,las=2,cex.axis=0.6,main='data check')
group_list=c(c(rep("EGF_4h",3)),c(rep("Control_4h",3)),c(rep("Control_12h",3)),c(rep("EGF_12h",3)))
####PCA看分組情況
library("FactoMineR")
library("factoextra")
####a data frame with n rows (individuals)
####and p columns (numeric variables)
df.pca <- PCA(t(ex), graph = FALSE)
fviz_pca_ind(df.pca,
geom.ind = "point",
col.ind = group_list,
addEllipses = TRUE,
legend.title = "Groups"
)
save(ex,pd,group_list,file='ex_pd.Rdata')
2.構(gòu)建矩陣,數(shù)據(jù)處理(DEG)
rm(list = ls()) ## 一鍵清空~
options(stringsAsFactors = F)
load('ex_pd.Rdata') ##加載步驟一保存的數(shù)據(jù)文件
###構(gòu)建實驗設(shè)計矩陣
ex=ex[,1:6] ##這里取數(shù)據(jù)的前六列(兩組)進行處理(limma包一次只能對兩組數(shù)據(jù)進行處理分析)
group_list=group_list[1:6] ##同上,group_list也取前六列的名稱
design <- model.matrix(~0+factor(group_list))
colnames(design)=levels(factor(group_list))
rownames(design)=colnames(ex)
library(limma)
####構(gòu)建比較矩陣,設(shè)置用來對比的組別
contrast.matrix<-makeContrasts(contrasts=c('EGF_4h-Control_4h'),levels = design)
###此處('EGF_4h-Control_4h') 中二者位置調(diào)換也會改變結(jié)果。。。注意!?。?######limma三部曲,只需要歸一化后的數(shù)據(jù)、實驗矩陣、比較矩陣的輸入 ?。。。。?!重要?。。。。?!
fit <- lmFit(ex,design)
fit1 <- contrasts.fit(fit, contrast.matrix)
fit2 <- eBayes(fit1)
####上面得到了p值等統(tǒng)計的結(jié)果,topTable對p值校驗,對基因排序
tT <- topTable(fit2, adjust="fdr", number=nrow(fit2))
dim(tT)
####提取出我們需要的指標(biāo)
tT <- subset(tT, select=c('adj.P.Val',"P.Value","logFC"))
save(tT,file='DEG.Rdata')
3.進行ID轉(zhuǎn)換
? 因為geo數(shù)據(jù)一般來源于基因芯片,故需要對其芯片探針號進行轉(zhuǎn)換,變成相應(yīng)的基因名稱
rm(list = ls()) ## 一鍵清空~
options(stringsAsFactors = F)
library(BiocManager)
##因為用的例子是大鼠的且芯片和rat2302有關(guān),故用對應(yīng)的rat2302包去轉(zhuǎn)換
BiocManager::install('rat2302.db')
library('rat2302.db')
load('ex_pd.Rdata')
load('DEG.Rdata')
#####
tT$probe_id <- rownames(tT)
ls("package:rat2302.db")
#####PROBEID_SYMBOL
a<- toTable(rat2302SYMBOL)
tT <- merge(tT,a,by.x='probe_id',by.y='probe_id')
#####PROBEID_ENTREZID
b<- toTable(rat2302ENTREZID)
tT <- merge(tT,b,by.x='probe_id',by.y='probe_id')
############################存在這種現(xiàn)象,多個探針對應(yīng)一個symbol####
table(!duplicated(tT$symbol))
ex<- ex[tT$probe_id,]
identical(rownames(ex),tT$probe_id)
####只是一種參考,能自圓其說即可
tT$median=apply(ex,1,median)
tT=tT[order(tT$symbol,tT$median,decreasing = T),]
dim(tT)
tT=tT[!duplicated(tT$symbol),]
save(tT,file='DEG_symbol.Rdata')
4.熱圖繪制
rm(list=ls())
load('ex_pd.Rdata')
load('DEG.Rdata')
ex <- ex[,1:6]
######pheatmap 這是取前1000個gene進行展示,可根據(jù)自己的需求進行調(diào)整
choose_gene<-rownames(tT)[1:1000]
#####用normalize后的數(shù)據(jù)進行展示
data_matrix <-ex[choose_gene,]
#####熱圖更詳細的了解http://www.itdecent.cn/p/0e5ce59fa79e
#####scale對數(shù)據(jù)處理,求得的結(jié)果為z-score,即數(shù)據(jù)與均值之間差幾個標(biāo)準(zhǔn)差
####t是對數(shù)據(jù)做轉(zhuǎn)置處理,為了適應(yīng)scale的要求
data_matrix=t(scale(t(data_matrix)))
#####查看scale處理后數(shù)據(jù)的范圍
fivenum(data_matrix)
####目的是避免出現(xiàn)極大極小值影響可視化的效果
###2,-2
data_matrix[data_matrix>1.2]=1.2
data_matrix[data_matrix< -1.2] = -1.2
library(pheatmap)
pheatmap(data_matrix)
####調(diào)整下顏色,使正負值顏色的對比會更加鮮明
require(RColorBrewer)
bk = c(seq(-1.2,1.2, length=100))
annotation_col = data.frame(Group = rep(c("EGF_4h", "Control_4h"), c(3,3)))
rownames(annotation_col)<-colnames(ex)
####annotation_col和annotation_row的格式需為數(shù)據(jù)框
####breaks參數(shù)用于值匹配顏色
####看下,color和breaks的對應(yīng),進行理解
pheatmap(data_matrix,
breaks=bk,
show_rownames = F,
annotation_col = annotation_col,
color = colorRampPalette(c("navy", "white", "firebrick3"))(100))
####可以調(diào)整的內(nèi)容有是否聚類、是否分組、是否顯示行和列的內(nèi)容等
5.火山圖繪制
rm(list = ls())
options(stringsAsFactors = F)
load(file = "DEG_symbol.Rdata")
####主要用兩個值,logFC和p.value
####可參考進行1的計算
logFC_cutoff <- with(tT,mean(abs( logFC)) + 2*sd(abs( logFC)) )
###1也可自行根據(jù)需求設(shè)置
####依據(jù)logFC和adj.P.val為火山圖的顏色分組做準(zhǔn)備
tT$change = ifelse(tT$adj.P.Val < 0.05 & abs(tT$logFC) > logFC_cutoff,
ifelse(tT$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')
####記錄上調(diào)、下調(diào)基因數(shù)目,作為title的內(nèi)容
### round(x,y)取x的y位小數(shù),如ruond(1.132442,2),則結(jié)果為1.13
this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),
'\nThe number of up gene is ',nrow(tT[tT$change =='UP',]) ,
'\nThe number of down gene is ',nrow(tT[tT$change =='DOWN',]))
library(ggplot2)
g = ggplot(data=tT,
aes(x=logFC, y=-log10(P.Value), color=change)) +
geom_point(alpha=0.4, size=1.75) +
theme_set(theme_set(theme_bw(base_size=20)))+
xlab("log2 fold change") + ylab("-log10 p-value") +
ggtitle( this_tile ) +
theme(plot.title = element_text(size=15,hjust = 0.5))+
scale_colour_manual(values = c('blue','black','red'))+
annotate('text',x=tT$logFC[tT$logFC>5],
y=-log10(tT$P.Value[tT$logFC>5]),
label=tT$symbol[tT$logFC>5])
print(g)
ggsave(g,filename = 'volcano.png')
save(tT,file='DEG_change.Rdata')
6.KEGG和GO分析
rm(list = ls())
options(stringsAsFactors = F)
load('DEG_change.Rdata')
tT <- na.omit(tT)
####富集前的基因處理
library(clusterProfiler)
gene_up= tT[tT$change == 'UP','gene_id']
gene_down=tT[tT$change == 'DOWN','gene_id']
gene_diff=c(gene_up,gene_down)
gene_all = tT$gene_id
###############################KEGG富集分析#####################################################
## 1.KEGG pathway analysis
#上調(diào)、下調(diào)、差異、所有基因
#clusterProfiler作kegg富集分析:
###我們在意的是表格里的pvalue,對應(yīng)的通路或term是否顯著
kk.up<- enrichKEGG(gene = gene_up,
organism = 'rno',
universe = gene_all,
pvalueCutoff = 5)
kk.down <- enrichKEGG(gene = gene_down,
organism = 'rno',
universe = gene_all)
load('annotation.Rdata')
#可視化展示-Barplot
down_kegg <- kk.down@result
down_kegg <- down_kegg[down_kegg$pvalue<1,]
barplot(kk.down,drop=T,showCategory = 12,title = 'bar_down')
barplot(kk.up,drop=T,showCategory = 12,title='bar_up')
#可視化展示-Dotplot
dotplot(kk.down,showCategory = 12,title = 'dot_down')
dotplot(kk.up,showCategory = 12,title='dot_up')
###############################GO############################################################
#細胞組分
library(org.Rn.eg.db)
ego_CC <- enrichGO(gene = gene_diff,
OrgDb= org.Rn.eg.db,
universe = gene_all,
ont = "CC",
pAdjustMethod = "BH")
#生物過程
ego_BP <- enrichGO(gene = gene_diff,
OrgDb= org.Rn.eg.db,
ont = "BP",
pAdjustMethod = "BH")
#分子功能:
ego_MF <- enrichGO(gene = gene_diff,
OrgDb= org.Rn.eg.db,
ont = "MF",
pAdjustMethod = "BH")
#可視化展示-Barplot
barplot(ego_CC, showCategory=12,title="bar_CC")
barplot(ego_BP, showCategory=12,title="bar_BP")
barplot(ego_MF, showCategory=12,title="bar_MF")
#可視化展示-Dotplot
dotplot(ego_CC,showCategory = 12,title="dot_CC")
dotplot(ego_BP,showCategory = 12,title="dot_BP")
dotplot(ego_MF,showCategory = 12,title="dot_MF")
save(kk.up,kk.down,ego_BP,ego_CC,ego_MF,file='annotation.Rdata')