下載TCGA FPKM數據
rm(list = ls())
# 安裝加載需要的R包
# install.packages("pacman", repos = 'https://mirror.lzu.edu.cn/CRAN/')
library(pacman)
p_load(TCGAbiolinks, tidyverse, magrittr, data.table, biomaRt)
library(dplyr)
# 設置工作目錄
projectPath = "C:/Users/81494/Desktop/changai fpkm" # 替換為真實的工作目錄
dataPath = paste(projectPath, "Data", sep = "/")
if(!dir.exists(dataPath)) dir.create(dataPath) # 新建一個Data文件夾,用于存放從TCGA上下載的數據
setwd(dataPath)
library(dplyr)
#### 使用TCGAbiolinks下載Counts和FPKM-UQ數據 ----
#https://www.bioconductor.org/packages/release/bioc/html/TCGAbiolinks.html
for(data_type in c("HTSeq - Counts", "HTSeq - FPKM")){
repeat{
query = try(GDCquery(project = "TCGA-COAD", legacy = FALSE, experimental.strategy = "RNA-Seq", data.category = "Transcriptome Profiling", data.type = "Gene Expression Quantification", workflow.type = data_type), silent = TRUE)
if(class(query) != "try-error") {
break
}else{
print("Try to connect to GDC...")
}
}
GDCdownload(query, files.per.chunk = 100) # 分批下載
# 數據預處理
dataAssy = GDCprepare(query, summarizedExperiment = F)
# # 保存變量,以備后用
# filename = paste0("TCGA-LIHC.RNA.query.", str_match(data_type, "- ([^\\s]*)$")[,2], ".RData")
# save(query, dataAssy, file = filename)
# load(filename)
# 數據整理
exp_data = dataAssy %>%
filter(str_detect(X1, "^ENSG")) %>% # 提取ENSG*注釋的基因
mutate(X1 = gsub("\\..*", "", X1)) %>% # 刪除版本號
rename(ensembl_gene_id = X1) %>% # "X1"重命名為"ensembl_gene_id"
set_colnames(str_replace(colnames(.), "(-\\w*){3}$", "")) # 修改TCGA樣本名
# 上兩行等同于:
# set_colnames(c("ensembl_gene_id", str_match(colnames(.)[-1], "(TCGA-[^-]*-[^-]*-[^-]*)")[,2])) # 修改列名
exp_data[1:3,1:3]
# 輸出
outfile = paste0("COAD_Portal_RNA_", str_match(data_type, "- ([^\\s]*)$")[,2], ".txt")
fwrite(exp_data, outfile, row.names = F, sep = "\t", quote = F)
下載TCGA臨床數據
XENA官網下載整合好的數據
p_load(data.table, tidyverse, magrittr)
clinic_data = read.table("COAD_clinicalMatrix", header = T, sep = "\t")
surv_data = read.table("COAD_survival.txt", header = T, sep = "\t")
clinic_data = merge(clinic_data, surv_data, by.x = "sampleID", by.y = "sample")
rownames(clinic_data) <- clinic_data$sampleID
clinic_data=clinic_data[,c('sampleID','CDE_ID_3226963','MSI_updated_Oct62011','loss_expression_of_mismatch_repair_proteins_by_ihc','loss_expression_of_mismatch_repair_proteins_by_ihc_result','microsatellite_instability','pathologic_M','pathologic_N','pathologic_T','pathologic_stage','OS','OS.time','DSS','DSS.time','PFI','PFI.time')]
整理表達矩陣
exp_FPKM = fread("COAD_Portal_RNA_FPKM.txt", h = T, sep = "\t", check.names = F) %>% column_to_rownames("ensembl_gene_id")
table(!duplicated(colnames(exp_FPKM)))
exp_FPKM = exp_FPKM[,!duplicated(colnames(exp_FPKM))]#去重復測序兩次的標本
exp_FPKM_T=exp_FPKM[,str_sub(colnames(exp_FPKM),14,16)=="01A"]#腫瘤樣本
colnames(exp_FPKM_T) <- str_sub(colnames(exp_FPKM_T),1,15)
dd1 <- exp_FPKM_T[1:20,1:20]
提取mRNA和gene名轉換
load("anno.Rdata")
#step4:表達矩陣拆分和注釋
mRNA_anno <- mRNA_anno %>%
tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.")
sum(rownames(exp_FPKM_T) %in% mRNA_anno$gene_id)
mRNA_exp = exp_FPKM_T[rownames(exp_FPKM_T) %in% mRNA_anno$gene_id,]
tmp = data.frame(gene_id = rownames(exp_FPKM_T))
x = dplyr::inner_join(tmp,mRNA_anno,by = "gene_id")
#inner_join不改變順序,可以確認一下
table(!duplicated(x$gene_name))
#行名不允許重復,因此一個ensambelid對應多個symbol的需要去掉。
mRNA_exp = mRNA_exp[!duplicated(x$gene_name),]
x = x[!duplicated(x$gene_name),]
rownames(mRNA_exp) = x$gene_name
mRNA_exp = na.omit(mRNA_exp)
表達矩陣和臨床數據融合
table(colnames(mRNA_exp) %in% clinic_data$sample)
index <- intersect(colnames(mRNA_exp),clinic_data$sample)
mRNA_exp1 = mRNA_exp[,index]
clinic_data1 <- clinic_data[index,]
write.csv(clinic_data1,'clinic_data2.csv')
手工整理臨床數據
在csv中剔除生存時間小于90天和NA的,和存活時間大于3000天的病例
表達矩陣和臨床數據再融合
clinic_data2 <- read.csv('clinic_data2.csv',header = T)
rownames(clinic_data2) <- clinic_data2$sampleID
table(colnames(mRNA_exp) %in% clinic_data2$sampleID)
index1 <- intersect(colnames(mRNA_exp),clinic_data2$sampleID)
mRNA_exp2 = mRNA_exp[,index1]
clinic_data2 <- clinic_data2[index1,]
MCP法
install.packages("MCPcounter-master/Source/",repos=NULL,type="source")
library(MCPcounter)
genes <- data.table::fread("genes.txt",data.table = F)
results<- MCPcounter.estimate(mRNA_exp2,
featuresType= "HUGO_symbols",
probesets=probesets,
genes=genes)
results2 <- results[1:9,]#去除最后一行fbro細胞的行
library(pheatmap)
pheatmap(results2, #熱圖的數據
cluster_rows = F,#行聚類
cluster_cols = T,#列聚類,可以看出樣本之間的區(qū)分度
##annotation_col =annotation_col, #標注樣本分類
annotation_legend=TRUE, # 顯示注釋
show_rownames = T,
show_colnames = F,# 顯示行名
scale = "row", #以行來標準化
color =colorRampPalette(c("blue", "white","red"))(100),#調色
#filename = "heatmap_F.pdf",#是否保存

圖片.png
生存分析
results1 <- as.data.frame(t(results))
Last <- cbind(results1,clinic_data2)
Last$group = ifelse(Last$`B lineage` > median(Last$`B lineage`),'high','low')
library(survival)
library(survminer)
sfit1=survfit(Surv(OS.time, OS)~group, data=Last)
ggsurvplot(sfit1,pval =TRUE, data = Last,risk.table = TRUE)
sfit1=survfit(Surv(DSS.time, DSS)~group, data=Last)
ggsurvplot(sfit1,pval =TRUE, data = Last, risk.table = TRUE)
#surv.cut = surv_cutpoint(Last, time = "OS.time", event = "OS", #variables = "B lineage", minprop = 0.3)
# 最優(yōu)分類值
#opticut = surv.cut$cutpoint$cutpoint
#opticut

圖片.png