原本還有第四個部分,小潔老師講了另一個R包下載表達矩陣和臨床信息的,
TCGA-4.使用RTCGA包獲取數據
但是這個包有個缺點就是數據更新不及時,因此當時看到的時候我就沒有跟學了。直接跳到第五步TCGA-5.(轉錄組)差異分析三大R包及其結果對比
但是呢,由于沒跟學第四步,這一步獲取數據并做數據清洗的時候出了問題,一直沒能完成。后來昨天花了點時間學了冰糖在菜鳥團的推文,也是小潔老師的第四步教程相關的內容,對比來看一步步調試,再加上從技能樹推文得到的小潔老師的畫圖函數后,終于完成了第五步的學習。
還是很有收獲的。
1.提前準備安裝和加載R包
# Start from a clean workspace (tutorial convention: this removes every object
# from the global environment).
rm(list = ls())
options(stringsAsFactors = FALSE)

# Packages needed by this walkthrough, split by the repository that hosts them.
# DESeq2, edgeR and limma are distributed through Bioconductor, so
# install.packages() cannot find them; they are installed via BiocManager.
cran_pkgs <- c("stringr", "ggplotify", "patchwork", "cowplot")
bioc_pkgs <- c("DESeq2", "edgeR", "limma")

# Install whatever is missing from CRAN.
for (pkg in cran_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Install whatever is missing from Bioconductor (bootstrapping BiocManager
# itself only when it is actually needed).
for (pkg in bioc_pkgs) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    if (!requireNamespace("BiocManager", quietly = TRUE)) {
      install.packages("BiocManager")
    }
    BiocManager::install(pkg)
  }
}

# Attach everything, in the same order as before. library() stops with an
# error if a package is still unavailable, instead of failing later on the
# first call into it.
for (pkg in c(cran_pkgs, bioc_pkgs)) {
  library(pkg, character.only = TRUE)
}
2.準備數據
本示例的數據是TCGA-KIRC的表達矩陣。tcga樣本編號14-15位是隱藏分組信息的,詳見:
TCGA的樣本id里藏著分組信息
TCGA樣本id,分組信息是在這個id的第14-15位,01-09是tumor,10-29是normal。
# ---- TCGA-KIRC: download clinical data and miRNA quantification ----
library(TCGAbiolinks)
library(tibble)
# The abbreviations of all supported projects can be listed with:
# TCGAbiolinks:::getGDCprojects()$project_id
# Same example project as before.
cancer_type <- "TCGA-KIRC"

# Clinical table for the project; preview the first cells and its size.
clinical <- GDCquery_clinic(project = cancer_type, type = "clinical")
clinical[1:4, 1:4]
dim(clinical)

# Query, download and assemble the miRNA expression quantification files.
query <- GDCquery(
  project = cancer_type,
  data.category = "Transcriptome Profiling",
  data.type = "miRNA Expression Quantification",
  workflow.type = "BCGSC miRNA Profiling"
)
GDCdownload(query, method = "api", files.per.chunk = 50)
expdat <- GDCprepare(query = query)
expdat[1:3, 1:3]

# Use the miRNA_ID column as row names.
rownames(expdat) <- NULL
expdat <- column_to_rownames(expdat, var = "miRNA_ID")
expdat[1:3, 1:3]

# Keep every third column starting with the first one, then transpose so the
# kept columns become rows.
# NOTE(review): presumably these are the per-sample read-count columns --
# confirm against the column layout returned by GDCprepare().
exp <- t(expdat[, seq(1, ncol(expdat), 3)])
exp[1:4, 1:4]

expr <- exp
# Third "_"-separated field of each row name (presumably the TCGA barcode
# that follows a "read_count_" prefix -- confirm).
rowName <- str_split(rownames(exp), "_", simplify = TRUE)[, 3]

# Coerce every column to numeric. apply() returns a matrix without row names,
# so the sample ids are restored immediately -- BEFORE na.omit() -- to keep
# ids and rows aligned even when na.omit() drops rows. (Assigning them after
# na.omit() fails with a length mismatch as soon as one row is removed.)
expr <- apply(expr, 2, as.numeric)
rownames(expr) <- rowName
expr <- na.omit(expr)
dim(expr)

# Keep only the columns whose value exceeds 1 in more than 10 rows;
# drop = FALSE keeps a matrix even if a single column survives.
keep <- apply(expr, 2, function(x) sum(x > 1) > 10)
expr <- expr[, keep, drop = FALSE]
dim(expr)
expr[1:4, 1:4]

save(expr, clinical, file = "tcga-kirc-download.Rdata")
# ---- Derive the tumor/normal grouping and cache the raw matrix ----
rm(list = ls())
load("tcga-kirc-download.Rdata") # restore the objects saved by the download step

# Subset of clinical columns kept in `meta`; `meta` is not used again in the
# code visible in this section.
meta <- clinical
colnames(meta)
clinical_fields <- c(
  "submitter_id", "vital_status",
  "days_to_death", "days_to_last_follow_up",
  "race",
  "age_at_diagnosis",
  "gender",
  "ajcc_pathologic_stage"
)
meta <- meta[, clinical_fields]

# Transpose so that the sample ids become column names.
expr <- t(expr)
expr[1:4, 1:4]

# Characters 14-15 of each sample id, read as a number: below 10 is labelled
# "tumor", everything else "normal". "normal" is made the first factor level.
sample_code <- as.numeric(str_sub(colnames(expr), 14, 15))
group_list <- factor(
  ifelse(sample_code < 10, "tumor", "normal"),
  levels = c("normal", "tumor")
)
table(group_list)
# Output recorded by the original author:
# normal tumor
# 71 545
save(expr, group_list, file = "tcga-kirc-raw.Rdata")
由于不知道小潔老師做了怎樣的過濾,我得到的結果不同
我覺得應該是在meta這個代碼步驟后面選擇一個指標過濾掉一些數據。
先放著,這個代碼在這個步驟中沒有用到。以后應該會用到。
由于不會自己寫代碼,后面的分析基本上就是走的小潔老師教程的內容。
3.三大R包的差異分析
# ---- DESeq2 ----
library(DESeq2)

# One row of sample annotation per column of the count matrix.
colData <- data.frame(
  condition = group_list,
  row.names = colnames(expr)
)
dds <- DESeqDataSetFromMatrix(
  countData = expr,
  colData = colData,
  design = ~ condition
)
# The reference level should be the control group, e.g.
# dds$condition <- relevel(dds$condition, ref = "untrt")
dds <- DESeq(dds)

# Pairwise comparison. The factor levels are reversed, so the contrast is
# c("condition", <last level>, <first level>).
res <- results(dds, contrast = c("condition", rev(levels(group_list))))
resOrdered <- res[order(res$pvalue), ] # sort by p-value
DEG <- as.data.frame(resOrdered)
head(DEG)

# Drop rows containing NA values.
DEG <- na.omit(DEG)

# Add a `change` column flagging each gene as UP, DOWN or NOT.
# logFC_cutoff <- with(DEG,mean(abs(log2FoldChange)) + 2*sd(abs(log2FoldChange)) )
logFC_cutoff <- 1
DEG$change <- with(DEG, {
  significant <- pvalue < 0.05 & abs(log2FoldChange) > logFC_cutoff
  direction <- ifelse(log2FoldChange > logFC_cutoff, "UP", "DOWN")
  as.factor(ifelse(significant, direction, "NOT"))
})
head(DEG)
DESeq2_DEG <- DEG
# ---- edgeR ----
library(edgeR)

# Build the DGEList, set library sizes to the column sums of the counts and
# compute normalisation factors.
dge <- DGEList(counts = expr, group = group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)

# Design without intercept: one column per level of group_list, named after
# the levels; rows are named after the samples.
design <- model.matrix(~ 0 + group_list)
rownames(design) <- colnames(dge)
colnames(design) <- levels(group_list)

# Dispersion estimation (common, trended, tagwise), then GLM fit.
dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)
fit <- glmFit(dge, design)

# Contrast weights follow the design columns: -1 for the first level, +1 for
# the second (tumor - normal with the level order set when group_list was
# built).
fit2 <- glmLRT(fit, contrast = c(-1, 1))

DEG <- topTags(fit2, n = nrow(expr))
DEG <- as.data.frame(DEG)

# Add a `change` column flagging each gene as UP, DOWN or NOT.
# The data-driven cutoff is kept only as a commented-out alternative (as in
# the DESeq2 and limma sections): it used to be computed here and then
# overwritten on the very next line.
# logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1
DEG$change <- as.factor(
  ifelse(DEG$PValue < 0.05 & abs(DEG$logFC) > logFC_cutoff,
    ifelse(DEG$logFC > logFC_cutoff, "UP", "DOWN"),
    "NOT"
  )
)
head(DEG)
table(DEG$change)
edgeR_DEG <- DEG
# ---- limma-voom ----
library(limma)

# Design without intercept: one column per level of group_list, named after
# the levels; rows are named after the samples.
design <- model.matrix(~ 0 + group_list)
colnames(design) <- levels(group_list)
rownames(design) <- colnames(expr)

dge <- DGEList(counts = expr)
dge <- calcNormFactors(dge)
# (An unused logCPM matrix was computed here previously; it has been removed
# because nothing read it.)

# Argument names are spelled out in full (normalize.method, number) instead
# of relying on partial argument matching.
v <- voom(dge, design, normalize.method = "quantile")
fit <- lmFit(v, design)

# Contrast string "<last level>-<first level>" built from the factor levels.
contrast_name <- paste(rev(levels(group_list)), collapse = "-")
cont.matrix <- makeContrasts(contrasts = contrast_name, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

DEG <- topTable(fit2, coef = contrast_name, number = Inf)
DEG <- na.omit(DEG)

# Add a `change` column flagging each gene as UP, DOWN or NOT.
# logFC_cutoff <- with(DEG,mean(abs(logFC)) + 2*sd(abs(logFC)) )
logFC_cutoff <- 1
DEG$change <- as.factor(
  ifelse(DEG$P.Value < 0.05 & abs(DEG$logFC) > logFC_cutoff,
    ifelse(DEG$logFC > logFC_cutoff, "UP", "DOWN"),
    "NOT"
  )
)
head(DEG)
limma_voom_DEG <- DEG

# Persist the three result tables together with the grouping factor.
save(DESeq2_DEG, edgeR_DEG, limma_voom_DEG, group_list, file = "DEG.Rdata")
# ---- Visualisation of the differential-expression results ----
rm(list = ls())
load("tcga-kirc-raw.Rdata")
load("DEG.Rdata")
# NOTE(review): presumably this script defines draw_pca(), draw_heatmap(),
# draw_volcano() and venn() used below -- it is not visible here.
source("3-plotfunction.R")
logFC_cutoff <- 1
expr[1:4, 1:4]

# PCA on log(count + 1).
dat <- log(expr + 1)
pca.plot <- draw_pca(dat, group_list)

# Row names of every gene not flagged "NOT", per package.
cg1 <- rownames(DESeq2_DEG)[DESeq2_DEG$change != "NOT"]
cg2 <- rownames(edgeR_DEG)[edgeR_DEG$change != "NOT"]
cg3 <- rownames(limma_voom_DEG)[limma_voom_DEG$change != "NOT"]

# One heatmap per package, restricted to that package's flagged genes.
h1 <- draw_heatmap(expr[cg1, ], group_list)
h2 <- draw_heatmap(expr[cg2, ], group_list)
h3 <- draw_heatmap(expr[cg3, ], group_list)

# One volcano plot per package; three columns are passed by position.
# NOTE(review): the positions are expected to be (fold change, p-value,
# change) for each result table -- confirm against the column order.
v1 <- draw_volcano(test = DESeq2_DEG[, c(2, 5, 7)], pkg = 1)
v2 <- draw_volcano(test = edgeR_DEG[, c(1, 4, 6)], pkg = 2)
v3 <- draw_volcano(test = limma_voom_DEG[, c(1, 4, 7)], pkg = 3)

# Heatmaps on the top row, volcano plots below, legends collected.
library(patchwork)
(h1 + h2 + h3) / (v1 + v2 + v3) + plot_layout(guides = 'collect')
# (v1 + v2 + v3) +plot_layout(guides = 'collect')
ggsave("heat_volcano.png", width = 21, height = 9)
# Comparison of the DEGs reported by the three R packages
# Intersection of the DEGs reported by the three R packages

# Return the row names of `df` whose `change` column equals "UP".
UP <- function(df) {
  is_up <- df$change == "UP"
  rownames(df)[is_up]
}
# Return the row names of `df` whose `change` column equals "DOWN".
DOWN <- function(df) {
  is_down <- df$change == "DOWN"
  rownames(df)[is_down]
}
# Genes flagged UP (respectively DOWN) by all three packages.
up <- Reduce(
  intersect,
  list(UP(DESeq2_DEG), UP(edgeR_DEG), UP(limma_voom_DEG))
)
down <- Reduce(
  intersect,
  list(DOWN(DESeq2_DEG), DOWN(edgeR_DEG), DOWN(limma_voom_DEG))
)
# Heatmap restricted to the shared genes.
hp <- draw_heatmap(expr[c(up, down), ], group_list)

# Separate Venn diagrams for the up- and down-regulated genes.
up.plot <- venn(
  UP(DESeq2_DEG), UP(edgeR_DEG), UP(limma_voom_DEG),
  "UPgene"
)
down.plot <- venn(
  DOWN(DESeq2_DEG), DOWN(edgeR_DEG), DOWN(limma_voom_DEG),
  "DOWNgene"
)

# Convert the Venn diagrams to ggplot objects so they can be combined with
# the other panels.
library(cowplot)
library(ggplotify)
up.plot <- as.ggplot(as_grob(up.plot))
down.plot <- as.ggplot(as_grob(down.plot))

# Combine PCA, shared-gene heatmap and both Venn diagrams, then save.
library(patchwork)
# up.plot + down.plot
pca.plot + hp + up.plot + down.plot
ggsave("deg.png", height = 10, width = 10)
整個流程走完得到的結果如下:

