TCGAbiolinks 是一個(gè)用于TCGA數(shù)據(jù)綜合分析的R/BioConductor軟件包，能夠通過GDC Application Programming Interface (API)訪問 National Cancer Institute (NCI) Genomic Data Commons (GDC) ，來搜索、下載和準(zhǔn)備TCGA相關(guān)數(shù)據(jù)，以便在R中進(jìn)行分析。

0、鏡像和R包安裝和加載

#設(shè)置鏡像
rm(list=ls())
options("repos"="https://mirrors.ustc.edu.cn/CRAN/")
#意思是，如果沒有安裝BiocManager包，那就安裝BiocManager
if(!require("BiocManager")) install.packages("BiocManager",update = F,ask = F)
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/")
BiocManager::install("SummarizedExperiment")
BiocManager::install("TCGAbiolinks")
library(TCGAbiolinks)
library(SummarizedExperiment)

1、TCGA數(shù)據(jù)下載

library(TCGAbiolinks)#加載包
query <- GDCquery(project = "TCGA-STAD", #腫瘤類型，##TCGAbiolinks:::getGDCprojects()$project_id#可查看
                  data.category = "Transcriptome Profiling",#數(shù)據(jù)范疇#轉(zhuǎn)錄組數(shù)據(jù)
                  data.type = "Gene Expression Quantification",#選定要下載的數(shù)據(jù)類型#gene表達(dá)數(shù)據(jù)
                  workflow.type = "HTSeq - Counts")#選定要下載RNAseq的Counts文件
#######################################################################################################
#下載rna-seq的counts數(shù)據(jù)                          
#data.type = "Gene Expression Quantification"
#下載miRNA數(shù)據(jù)                                              
#data.type = "miRNA Expression Quantification"
#下載Copy Number Variation數(shù)據(jù)                    
#data.type = "Copy Number Segment"               
#######################################################################################################
samplesDown <- getResults(query,cols=c("cases"))  
#getResults(query, rows, cols)根據(jù)指定行名或列名從query中獲取結(jié)果,此處用來獲得樣本的barcode，即，樣本名
# 此處共檢索出407個(gè)barcodes
head(samplesDown)#407樣本名
[1] "TCGA-F1-A448-01A-11R-A24K-31" "TCGA-D7-6522-01A-11R-1802-13"
[3] "TCGA-VQ-A94R-01A-11R-A414-31" "TCGA-RD-A8MV-01A-11R-A36D-31"
[5] "TCGA-BR-4191-01A-02R-1131-13" "TCGA-BR-8291-01A-11R-2343-13"
#######################################################################################################
# TCGAquery_SampleTypes(barcode, typesample) #篩選腫瘤和正常組織的barcode，方便后面分組做差異分析
# TP代表PRIMARY SOLID TUMOR；NT-代表Solid Tissue Normal（其他組織樣本可參考學(xué)習(xí)文檔）
dataSmTP <- TCGAquery_SampleTypes(barcode = samplesDown,
                                  typesample = "TP")
#View(dataSmTP )# 從samplesDown中篩選出375個(gè)TP樣本barcodes

dataSmNT <- TCGAquery_SampleTypes(barcode = samplesDown,
                                  typesample = "NT")
#View(dataSmNT)# 從samplesDown中篩選出32個(gè)正常樣本barcodes
#######################################################################################################
###設(shè)置barcodes參數(shù)，篩選符合要求的375個(gè)腫瘤樣本數(shù)據(jù)和32正常組織數(shù)據(jù)
#barcode參數(shù)：根據(jù)傳入barcode進(jìn)行數(shù)據(jù)過濾
queryDown <- GDCquery(project = "TCGA-SATD", 
                      data.category = "Transcriptome Profiling",
                      data.type = "Gene Expression Quantification", 
                      workflow.type = "HTSeq - Counts", 
                      barcode = c(dataSmTP, dataSmNT))
#######################################################################################################
GDCdownload(queryDown, #上面queryDown的結(jié)果
            method = "api",#兩種方法，api或者gdc-client
            files.per.chunk = 6)#分為多個(gè)片段下載,可以解決下載容易中斷的問題。
##下載完成，前面如果已經(jīng)下載了，這里不會(huì)再次下載

2、數(shù)據(jù)整理

########################################################################################
library(SummarizedExperiment)
#GDCprepare()將前面GDCquery（）的結(jié)果準(zhǔn)備成R語言可處理的SE（SummarizedExperiment）文件
dataPrep1 <- GDCprepare(query = queryDown, save = TRUE, save.filename =
                          "STAD_case.Rda")
count_matrix=assay(dataPrep1)#數(shù)組
write.csv(count_matrix,file = paste("TCGA-STAD","Counts.csv",sep = "-"))#文件保存
###########################################################################################
# 去除dataPrep1中的異常值，dataPrep1數(shù)據(jù)中含有腫瘤組織和正常組織的數(shù)據(jù)
##TCGAanalyze_Preprocessing()對(duì)數(shù)據(jù)進(jìn)行預(yù)處理：
#使用spearman相關(guān)系數(shù)去除數(shù)據(jù)中的異常值
#TCGAanalyze_Preprocessing(object, cor.cut = 0, filename = NULL,width = 1000,                        height = 1000, datatype = names(assays(object))[1])
# 函數(shù)功能描述：
#Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier 
dataPrep2 <- TCGAanalyze_Preprocessing(object = dataPrep1,
                                      cor.cut = 0.6,
                                      datatype = "HTSeq - Counts")
#將預(yù)處理后的數(shù)據(jù)dataPrep2，寫入新文件“LIHC_dataPrep.csv”
write.csv(dataPrep2,file = "STAD_dataPrep.csv",quote = FALSE)
###########################################################################################
# TCGAtumor_purity(barcodes, estimate, absolute, lump, ihc, cpe);
#使用來自5種方法的5個(gè)估計(jì)值作為閾值對(duì)TCGA樣本進(jìn)行過濾，這5個(gè)值是estimate, absolute, lump, ihc, cpe;
#這里設(shè)置cpe=0.6（cpe是派生的共識(shí)度量，是將所有方法的標(biāo)準(zhǔn)含量歸一化后的均值純度水平，以使它們具有相等的均值和標(biāo)準(zhǔn)差）

#篩選腫瘤純度大于等于60%的樣本數(shù)據(jù)
purityDATA <- TCGAtumor_purity(colnames(dataPrep1), 0, 0, 0, 0, 0.6)
# filtered 為被過濾的數(shù)據(jù)， pure_barcodes是我們要的腫瘤數(shù)據(jù)
Purity.STAD<-purityDATA$pure_barcodes
normal.STAD<-purityDATA$filtered
#獲取腫瘤純度大于60%的375個(gè)腫瘤組織樣本+32個(gè)正常組織樣本,共計(jì)407個(gè)樣本
puried_data <-dataPrep2[,c(Purity.STAD,normal.STAD)]
###########################################################################################
#基因注釋,需要加載“SummarizedExperiment”包，“SummarizedExperiment container”每個(gè)由數(shù)字或其他模式的類似矩陣的對(duì)象表示。行通常表示感興趣的基因組范圍和列代表樣品。
library("SummarizedExperiment")
rowData(dataPrep1)  #傳入數(shù)據(jù)dataPrep1必須為 
#                SummarizedExpensembl_gene_id      external_gene_name    original_ensembl_gene_id
#                   <character>        <character>
#ENSG00000000003 ENSG00000000003             TSPAN6 ENSG00000000003.13
#ENSG00000000005 ENSG00000000005               TNMD ENSG00000000005.5
#ENSG00000000419 ENSG00000000419               DPM1 ENSG00000000419.11
#ENSG00000000457 ENSG00000000457              SCYL3 ENSG00000000457.12
#ENSG00000000460 ENSG00000000460           C1orf112 ENSG00000000460.15
#將結(jié)果寫入文件“puried.STAD.cancer.csv”
rownames(puried_data)<-rowData(dataPrep1)$external_gene_name
write.csv(puried_data,file = "puried.STAD.csv",quote = FALSE)
###########################################################################################
#進(jìn)行表達(dá)矩陣標(biāo)準(zhǔn)化和過濾，得到用于差異分析的表達(dá)矩陣
#TCGAanalyze_Normalization（）# 使用EDASeq軟件包標(biāo)準(zhǔn)化mRNA轉(zhuǎn)錄本和miRNA。
#TCGAanalyze_Normalization()執(zhí)行EDASeq包中的如下功能：
#1. EDASeq::newSeqExpressionSet
#2. EDASeq::withinLaneNormalization
#3. EDASeq::betweenLaneNormalization
#4. EDASeq::counts
dataNorm <- TCGAanalyze_Normalization(tabDF = puried_data,#RNAseq表達(dá)矩陣，行代表基因，列代表樣本|
                                      geneInfo = geneInfo,
                                      method = "gcContent")
#|geneInfo|關(guān)于geneLength和gcContent的20531個(gè)基因的矩陣，“geneInfoHT”和“geneInfo”可選。
#method |選擇標(biāo)準(zhǔn)化的方法，基于’gcContent’ 或 ’geneLength’的標(biāo)準(zhǔn)化方法可選

#將標(biāo)準(zhǔn)化后的數(shù)據(jù)再過濾，去除掉表達(dá)量較低（count較低）的基因，得到最終的數(shù)據(jù)
dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
                                  method = "quantile", #用于過濾較低count數(shù)的基因的方法，有’quantile’, ’varFilter’, ’filter1’, ’filter2’
                                  qnt.cut =  0.25)#過濾的閾值
str(dataFilt)
write.csv(dataFilt,file = "TCGA_STAD_final.csv",quote = FALSE)

3、基因差異表達(dá)分析

#首先讀入表達(dá)矩陣文件
dataFilt_LIHC_final <- read.csv("TCGA_STAD_final.csv", header = T,check.names = FALSE）
# 定義行名
rownames(dataFilt_LIHC_final) <- dataFilt_LIHC_final[,1]
dataFilt_LIHC_final <- dataFilt_LIHC_final[,-1]
View(dataFilt_STAD_final)
#定義樣本的分組，腫瘤組和正常組
#目前只知道根據(jù)barcode14和15位數(shù)字0—9為腫瘤，大于就為正常樣本
metadata <- data.frame(colnames(dataFilt_STAD_final))#View(metadata)
for (i in 1:length(metadata[,1])) {
  num <- as.numeric(substring(metadata[i,1],14,15))
  if (num %in% seq(1,9)) {metadata[i,2] <- "Tum"}
  if (num %in% seq(10,29)) {metadata[i,2] <- "Nor"}
}
colnames(metadata)[2] <- c("fenzu")
ifelse(metadata$fenzu%in%c("Tum")==TRUE,"Tum","Nor")#發(fā)現(xiàn)從376行開始是正常的
# 定義腫瘤樣本分組
mat1 <- dataFilt_STAD_final[,1-375]
mat1 <- log(mat1+1)
# 定義正常組織樣本分組
mat2 <- dataFilt_STAD_final[,376-407]
mat2 <- log(mat2+1)
#數(shù)據(jù)準(zhǔn)備好了，開始差異表達(dá)分析
Data_DEGs <- TCGAanalyze_DEA(mat1 = mat1,#腫瘤組織的表達(dá)矩陣
                             mat2 = mat2,#正常樣本的表達(dá)矩陣
                             Cond1type = "Tumor",#mat1中的樣品分組信息
                             Cond2type = "Normal",#mat2中的樣品分組信息
                             pipeline="limma",#用limma包還是edgeR
                             batch.factors = c("TSS"),#批處理糾正選項(xiàng)
                             voom = TRUE,
                             contrast.formula = "Mycontrast=Tumor-Normal")
View(Data_DEGs)#結(jié)果展示
######################################################################################
#設(shè)置logFC，挑選表達(dá)有差異的基因進(jìn)行富集分析
Data_DEGs_high_expr <- Data_DEGs[Data_DEGs$logFC >=1,]
Genelist <- rownames(Data_DEGs_high_expr)
ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor", Genelist)
#富集分析結(jié)果可視化##
TCGAvisualize_EAbarplot(tf  = rownames(ansEA$ResBP),
                        GOBPTab = ansEA$ResBP,
                        GOCCTab = ansEA$ResCC,
                        GOMFTab = ansEA$ResMF,
                        PathTab = ansEA$ResPat,
                        nRGTab = Genelist,
                        nBar = 10, #顯示條形圖的數(shù)量
                        filename = "TCGAvisualize_EAbarplot_Output.pdf")
#結(jié)果沒富集出來提示
#Error in mtext(mainLab, side = 3, line = -1, outer = TRUE, font = 2) :  plot.new has not been called yet

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

TCGA數(shù)據(jù)下載——TCGAbiolinks

TCGA數(shù)據(jù)下載——TCGAbiolinks

0、鏡像和R包安裝和加載

1、TCGA數(shù)據(jù)下載

2、數(shù)據(jù)整理

3、基因差異表達(dá)分析

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

TCGA數(shù)據(jù)下載——TCGAbiolinks

0、鏡像和R包安裝和加載

1、TCGA數(shù)據(jù)下載

2、數(shù)據(jù)整理

3、基因差異表達(dá)分析

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

0、鏡像和R包安裝和加載

2、數(shù)據(jù)整理

3、基因差異表達(dá)分析