1-1從GEO數(shù)據(jù)庫下載數(shù)據(jù)
rm(list = ls())
options(stringsAsFactors = F)
library(GEOquery)
gse = "GSE4107"
eSet <- getGEO(gse,
destdir = '.',
getGPL = F)
1-2提取表達矩陣exp
exp <- exprs(eSet[[1]])
head(exp)
exp = log2(exp+1)
dim(exp) #22個樣本,54975個探針
boxplot(exp) #對樣本的總體表達水平范圍有一個了解

截屏2020-07-18 19.35.13.png
1-3提取各樣本的分組信息
pd <- pData(eSet[[1]])
1-4調(diào)整exp的行名順序與pd列名完全一致
p = identical(rownames(pd),colnames(exp));p
if(!p) exp <- exp[,match(rownames(pd),colnames(exp))]
1-5提取芯片平臺編號,保存
gpl <- eSet[[1]]@annotation
2-1提取分組信息
rm(list = ls())
load(file = "step1output.Rdata")
library(stringr)
group_list=ifelse(str_detect(pd$title,"control"),"control","treat")
table(group_list)
group_list = factor(group_list,
levels = c("control","treat"))
2-2獲取芯片的探針注釋
gpl
#http://www.bio-info-trainee.com/1399.html
if(!require(hgu133plus2.db))BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)
ls("package:hgu133plus2.db")
ids <- toTable(hgu133plus2SYMBOL) #toTable轉(zhuǎn)換為矩陣
head(ids)
save(exp,group_list,ids,file = "step2output.Rdata")
3借助PCA和熱圖,進行數(shù)據(jù)檢查
3-1PCA
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step2output.Rdata")
dat=as.data.frame(t(exp)) #轉(zhuǎn)換為行是樣本,列是基因
library(FactoMineR)#畫主成分分析圖需要加載這兩個包
library(factoextra)
# pca
dat.pca <- PCA(dat, graph = FALSE)
pca_plot <- fviz_pca_ind(dat.pca,
geom.ind = "point", # show points only (nbut not "text")
col.ind = group_list, # color by groups
#palette = c("#00AFBB", "#E7B800"), #修改顏色
addEllipses = TRUE, # Concentration ellipses
legend.title = "Groups"
)
pca_plot
ggsave(plot = pca_plot,filename = paste0(gse,"PCA.png"))
save(pca_plot,file = "pca_plot.Rdata")

GSE4107PCA.png
3-2熱圖
cg=names(tail(sort(apply(exp,1,sd)),1000)) #對exp按行計算每個探針的標準差,從小到大排序,取標準差最大的前1000個探針,并提取對應的探針名
n=exp[cg,]
#繪制熱圖
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(pheatmap)
pheatmap(n,
show_colnames =F,
show_rownames = F,
annotation_col=annotation_col,
scale = "row")
dev.off()
4差異分析
4-1差異基因分析
rm(list = ls())
load(file = "step2output.Rdata")
library(limma)
design=model.matrix(~group_list)
fit=lmFit(exp,design)
fit=eBayes(fit)
deg=topTable(fit,coef=2,number = Inf)
4-2加probe_id列,把行名變成一列
library(dplyr)
deg <- mutate(deg,probe_id=rownames(deg));head(deg)
4-3加symbol列,去重復
deg <- inner_join(deg,ids,by="probe_id");head(deg)
deg <- deg[!duplicated(deg$symbol),]
4-4標記上下調(diào)基因
logFC_t=1 #變化超過2倍的視為差異基因
P.Value_t = 0.01 #P值小于等于0.01視為顯著
k1 = (deg$P.Value < P.Value_t)&(deg$logFC < -logFC_t)
k2 = (deg$P.Value < P.Value_t)&(deg$logFC > logFC_t)
change = ifelse(k1,"down",ifelse(k2,"up","stable"))
deg <- mutate(deg,change);head(deg)
4-5加ENTREZID列,用于富集分析
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
s2e <- bitr(deg$symbol,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)#人類
deg <- inner_join(deg,s2e,by=c("symbol"="SYMBOL"));head(deg)
save(group_list,deg,logFC_t,P.Value_t,file = "step4output.Rdata")
5畫火山圖
rm(list = ls())
load(file = "step1output.Rdata")
load(file = "step4output.Rdata")
library(dplyr)
library(ggplot2)
dat = deg
for_label <- dat%>%
filter(symbol %in% c("TRPM3","SFRP1"))
p <- ggplot(data = dat,
aes(x = logFC,
y = -log10(P.Value))) +
geom_point(alpha=0.4, size=3.5,
aes(color=change)) +
ylab("-log10(Pvalue)")+
scale_color_manual(values=c("blue", "grey","red"))+
geom_vline(xintercept=c(-logFC_t,logFC_t),lty=4,col="black",lwd=0.8) +
geom_hline(yintercept = -log10(P.Value_t),lty=4,col="black",lwd=0.8) +
theme_bw()
p
volcano_plot <- p +
geom_point(size = 3, shape = 1, data = for_label) +
ggrepel::geom_label_repel(
aes(label = symbol),
data = for_label,
color="black"
)
volcano_plot
ggsave(plot = volcano_plot,filename = paste0(gse,"volcano.png"))

GSE4107volcano.png
6畫熱圖
load(file = 'step2output.Rdata')
x=deg$logFC
names(x)=deg$probe_id
cg = deg$probe_id[deg$change !="stable"]
n=exp[cg,]
dim(n)
library(pheatmap)
annotation_col=data.frame(group=group_list)
rownames(annotation_col)=colnames(n)
library(ggplotify)
heatmap_plot <- as.ggplot(pheatmap(n,show_colnames =F,
show_rownames = F,
scale = "row",
#cluster_cols = F,
annotation_col=annotation_col))
heatmap_plot
ggsave(heatmap_plot,filename = paste0(gse,"heatmap.png"))

GSE4107heatmap.png

test.png