TCGAbiolinks 是一個(gè)用于TCGA數(shù)據(jù)綜合分析的R/BioConductor軟件包,能夠通過GDC Application Programming Interface (API)訪問 National Cancer Institute (NCI) Genomic Data Commons (GDC) ,來搜索、下載和準(zhǔn)備TCGA相關(guān)數(shù)據(jù),以便在R中進(jìn)行分析。
0、鏡像和R包安裝和加載
#設(shè)置鏡像
rm(list=ls())
options("repos"="https://mirrors.ustc.edu.cn/CRAN/")
#意思是,如果沒有安裝BiocManager包,那就安裝BiocManager
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
BiocManager::install("SummarizedExperiment")
BiocManager::install("TCGAbiolinks")
library(TCGAbiolinks)
library(SummarizedExperiment)
1、TCGA數(shù)據(jù)下載
library(TCGAbiolinks)#加載包
query <- GDCquery(project = "TCGA-STAD", #腫瘤類型,##TCGAbiolinks:::getGDCprojects()$project_id#可查看
data.category = "Transcriptome Profiling",#數(shù)據(jù)范疇#轉(zhuǎn)錄組數(shù)據(jù)
data.type = "Gene Expression Quantification",#選定要下載的數(shù)據(jù)類型#gene表達(dá)數(shù)據(jù)
workflow.type = "HTSeq - Counts")#選定要下載RNAseq的Counts文件
#######################################################################################################
#下載rna-seq的counts數(shù)據(jù)
#data.type = "Gene Expression Quantification"
#下載miRNA數(shù)據(jù)
#data.type = "miRNA Expression Quantification"
#下載Copy Number Variation數(shù)據(jù)
#data.type = "Copy Number Segment"
#######################################################################################################
samplesDown <- getResults(query,cols=c("cases"))
#getResults(query, rows, cols)根據(jù)指定行名或列名從query中獲取結(jié)果,此處用來獲得樣本的barcode,即,樣本名
# 此處共檢索出407個(gè)barcodes
head(samplesDown)#407樣本名
[1] "TCGA-F1-A448-01A-11R-A24K-31" "TCGA-D7-6522-01A-11R-1802-13"
[3] "TCGA-VQ-A94R-01A-11R-A414-31" "TCGA-RD-A8MV-01A-11R-A36D-31"
[5] "TCGA-BR-4191-01A-02R-1131-13" "TCGA-BR-8291-01A-11R-2343-13"
#######################################################################################################
# TCGAquery_SampleTypes(barcode, typesample) #篩選腫瘤和正常組織的barcode,方便后面分組做差異分析
# TP代表PRIMARY SOLID TUMOR;NT-代表Solid Tissue Normal(其他組織樣本可參考學(xué)習(xí)文檔)
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
typesample = "TP")
#View(dataSmTP )# 從samplesDown中篩選出375個(gè)TP樣本barcodes
dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
typesample = "NT")
#View(dataSmNT)# 從samplesDown中篩選出32個(gè)正常樣本barcodes
#######################################################################################################
###設(shè)置barcodes參數(shù),篩選符合要求的375個(gè)腫瘤樣本數(shù)據(jù)和32正常組織數(shù)據(jù)
#barcode參數(shù):根據(jù)傳入barcode進(jìn)行數(shù)據(jù)過濾
queryDown <- GDCquery(project = "TCGA-SATD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts",
barcode = c(dataSmTP, dataSmNT))
#######################################################################################################
GDCdownload(queryDown, #上面queryDown的結(jié)果
method = "api",#兩種方法,api或者gdc-client
files.per.chunk = 6)#分為多個(gè)片段下載,可以解決下載容易中斷的問題。
##下載完成,前面如果已經(jīng)下載了,這里不會(huì)再次下載
2、數(shù)據(jù)整理
########################################################################################
library(SummarizedExperiment)
#GDCprepare()將前面GDCquery()的結(jié)果準(zhǔn)備成R語言可處理的SE(SummarizedExperiment)文件
dataPrep1 <- GDCprepare(query = queryDown, save = TRUE, save.filename =
"STAD_case.Rda")
count_matrix=assay(dataPrep1)#數(shù)組
write.csv(count_matrix,file = paste("TCGA-STAD","Counts.csv",sep = "-"))#文件保存
###########################################################################################
# 去除dataPrep1中的異常值,dataPrep1數(shù)據(jù)中含有腫瘤組織和正常組織的數(shù)據(jù)
##TCGAanalyze_Preprocessing()對(duì)數(shù)據(jù)進(jìn)行預(yù)處理:
#使用spearman相關(guān)系數(shù)去除數(shù)據(jù)中的異常值
#TCGAanalyze_Preprocessing(object, cor.cut = 0, filename = NULL,width = 1000, height = 1000, datatype = names(assays(object))[1])
# 函數(shù)功能描述:
#Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier
dataPrep2 <- TCGAanalyze_Preprocessing(object = dataPrep1,
cor.cut = 0.6,
datatype = "HTSeq - Counts")
#將預(yù)處理后的數(shù)據(jù)dataPrep2,寫入新文件“LIHC_dataPrep.csv”
write.csv(dataPrep2,file = "STAD_dataPrep.csv",quote = FALSE)
###########################################################################################
# TCGAtumor_purity(barcodes, estimate, absolute, lump, ihc, cpe);
#使用來自5種方法的5個(gè)估計(jì)值作為閾值對(duì)TCGA樣本進(jìn)行過濾,這5個(gè)值是estimate, absolute, lump, ihc, cpe;
#這里設(shè)置cpe=0.6(cpe是派生的共識(shí)度量,是將所有方法的標(biāo)準(zhǔn)含量歸一化后的均值純度水平,以使它們具有相等的均值和標(biāo)準(zhǔn)差)
#篩選腫瘤純度大于等于60%的樣本數(shù)據(jù)
purityDATA <- TCGAtumor_purity(colnames(dataPrep1), 0, 0, 0, 0, 0.6)
# filtered 為被過濾的數(shù)據(jù), pure_barcodes是我們要的腫瘤數(shù)據(jù)
Purity.STAD<-purityDATA$pure_barcodes
normal.STAD<-purityDATA$filtered
#獲取腫瘤純度大于60%的375個(gè)腫瘤組織樣本+32個(gè)正常組織樣本,共計(jì)407個(gè)樣本
puried_data <-dataPrep2[,c(Purity.STAD,normal.STAD)]
###########################################################################################
#基因注釋,需要加載“SummarizedExperiment”包,“SummarizedExperiment container”每個(gè)由數(shù)字或其他模式的類似矩陣的對(duì)象表示。行通常表示感興趣的基因組范圍和列代表樣品。
library("SummarizedExperiment")
rowData(dataPrep1) #傳入數(shù)據(jù)dataPrep1必須為
# SummarizedExpensembl_gene_id external_gene_name original_ensembl_gene_id
# <character> <character>
#ENSG00000000003 ENSG00000000003 TSPAN6 ENSG00000000003.13
#ENSG00000000005 ENSG00000000005 TNMD ENSG00000000005.5
#ENSG00000000419 ENSG00000000419 DPM1 ENSG00000000419.11
#ENSG00000000457 ENSG00000000457 SCYL3 ENSG00000000457.12
#ENSG00000000460 ENSG00000000460 C1orf112 ENSG00000000460.15
#將結(jié)果寫入文件“puried.STAD.cancer.csv”
rownames(puried_data)<-rowData(dataPrep1)$external_gene_name
write.csv(puried_data,file = "puried.STAD.csv",quote = FALSE)
###########################################################################################
#進(jìn)行表達(dá)矩陣標(biāo)準(zhǔn)化和過濾,得到用于差異分析的表達(dá)矩陣
#TCGAanalyze_Normalization()# 使用EDASeq軟件包標(biāo)準(zhǔn)化mRNA轉(zhuǎn)錄本和miRNA。
#TCGAanalyze_Normalization()執(zhí)行EDASeq包中的如下功能:
#1. EDASeq::newSeqExpressionSet
#2. EDASeq::withinLaneNormalization
#3. EDASeq::betweenLaneNormalization
#4. EDASeq::counts
dataNorm <- TCGAanalyze_Normalization(tabDF = puried_data,#RNAseq表達(dá)矩陣,行代表基因,列代表樣本|
geneInfo = geneInfo,
method = "gcContent")
#|geneInfo|關(guān)于geneLength和gcContent的20531個(gè)基因的矩陣,“geneInfoHT”和“geneInfo”可選。
#method |選擇標(biāo)準(zhǔn)化的方法,基于’gcContent’ 或 ’geneLength’的標(biāo)準(zhǔn)化方法可選
#將標(biāo)準(zhǔn)化后的數(shù)據(jù)再過濾,去除掉表達(dá)量較低(count較低)的基因,得到最終的數(shù)據(jù)
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
method = "quantile", #用于過濾較低count數(shù)的基因的方法,有’quantile’, ’varFilter’, ’filter1’, ’filter2’
qnt.cut = 0.25)#過濾的閾值
str(dataFilt)
write.csv(dataFilt,file = "TCGA_STAD_final.csv",quote = FALSE)
3、基因差異表達(dá)分析
#首先讀入表達(dá)矩陣文件
dataFilt_LIHC_final <- read.csv("TCGA_STAD_final.csv", header = T,check.names = FALSE)
# 定義行名
rownames(dataFilt_LIHC_final) <- dataFilt_LIHC_final[,1]
dataFilt_LIHC_final <- dataFilt_LIHC_final[,-1]
View(dataFilt_STAD_final)
#定義樣本的分組,腫瘤組和正常組
#目前只知道根據(jù)barcode14和15位數(shù)字0—9為腫瘤,大于就為正常樣本
metadata <- data.frame(colnames(dataFilt_STAD_final))#View(metadata)
for (i in 1:length(metadata[,1])) {
num <- as.numeric(substring(metadata[i,1],14,15))
if (num %in% seq(1,9)) {metadata[i,2] <- "Tum"}
if (num %in% seq(10,29)) {metadata[i,2] <- "Nor"}
}
colnames(metadata)[2] <- c("fenzu")
ifelse(metadata$fenzu%in%c("Tum")==TRUE,"Tum","Nor")#發(fā)現(xiàn)從376行開始是正常的
# 定義腫瘤樣本分組
mat1 <- dataFilt_STAD_final[,1-375]
mat1 <- log(mat1+1)
# 定義正常組織樣本分組
mat2 <- dataFilt_STAD_final[,376-407]
mat2 <- log(mat2+1)
#數(shù)據(jù)準(zhǔn)備好了,開始差異表達(dá)分析
Data_DEGs <- TCGAanalyze_DEA(mat1 = mat1,#腫瘤組織的表達(dá)矩陣
mat2 = mat2,#正常樣本的表達(dá)矩陣
Cond1type = "Tumor",#mat1中的樣品分組信息
Cond2type = "Normal",#mat2中的樣品分組信息
pipeline="limma",#用limma包還是edgeR
batch.factors = c("TSS"),#批處理糾正選項(xiàng)
voom = TRUE,
contrast.formula = "Mycontrast=Tumor-Normal")
View(Data_DEGs)#結(jié)果展示
######################################################################################
#設(shè)置logFC,挑選表達(dá)有差異的基因進(jìn)行富集分析
Data_DEGs_high_expr <- Data_DEGs[Data_DEGs$logFC >=1,]
Genelist <- rownames(Data_DEGs_high_expr)
ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor", Genelist)
#富集分析結(jié)果可視化##
TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
GOBPTab = ansEA$ResBP,
GOCCTab = ansEA$ResCC,
GOMFTab = ansEA$ResMF,
PathTab = ansEA$ResPat,
nRGTab = Genelist,
nBar = 10, #顯示條形圖的數(shù)量
filename = "TCGAvisualize_EAbarplot_Output.pdf")
#結(jié)果沒富集出來提示
#Error in mtext(mainLab, side = 3, line = -1, outer = TRUE, font = 2) : plot.new has not been called yet