參考學習資料:TCGA的cox模型構建和風險森林圖
接著之前做差異分析得到的數(shù)據(jù)數(shù)據(jù)
再來進一步分析:
1.準備輸入數(shù)據(jù)
rm(list = ls())
options(stringsAsFactors = F)
load("tcga-kirc-raw.Rdata")
exprSet = expr[,group_list=="tumor"]
dim(exprSet)
## [1] 812 545
head(meta)
exprSet[1:4,1:4]
head(colnames(exprSet))
head(meta$submitter_id)
expr2=exprSet[,meta$submitter_id%in%substring(colnames(exprSet),1,12)]
meta=meta[meta$submitter_id%in%substring(colnames(exprSet),1,12),]
dim(meta)
dim(expr2)
exprSet=expr2
dim(exprSet)
s1=unique(colnames(expr2))#去重復
colnames(expr2)=substring(colnames(exprSet),1,12)
head(colnames(expr2))
s2=unique(colnames(expr2))
expr3=expr2[,s2]
#meta$submitter_id%in%colnames(expr3)
meta=meta[meta$submitter_id%in%colnames(expr3),]
exprSet=expr3
dim(meta)
## [1] 495 8
dim(exprSet)
## [1] 812 495
2.挑選感興趣的基因構建coxph模型
e=t(exprSet[c("hsa-mir-21","hsa-mir-181a-1","hsa-mir-424",'hsa-mir-192',"hsa-mir-195"),])
e=log2(e)
colnames(e)=c('miR21','miR181a','miR424','miR192','miR195')
dat=cbind(meta,e)
dat$gender=factor(dat$gender)
dat$ajcc_pathologic_stage=factor(dat$ajcc_pathologic_stage)
colnames(dat)
dat$status=ifelse(dat$vital_status=="Alive",1,0)
colnames(dat)[8]="Stage"
#用survival::coxph()函數(shù)構建模型
library(survival)
library(survminer)
s=Surv(age_at_diagnosis, status) ~ gender + Stage + miR21+miR181a+miR424+miR192+miR195
model <- coxph(s, data = dat )

建模前
3.模型可視化-森林圖
options(scipen=1)
ggforest(model, data =dat, main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1",
noDigits = 4)
4.模型預測
fp <- predict(model)
summary(model,data=dat)$concordance
library(Hmisc)
options(scipen=200)
#with(dat,rcorr.cens(fp,Surv(age_at_diagnosis, status)))
模型預測這一步看教程是個例子,弄了蠻久還是不太明白。
5.切割數(shù)據(jù)構建模型并預測
5.1 切割數(shù)據(jù)
library(caret)
set.seed(12345679)
sam<- createDataPartition(dat$status, p = .5, list = FALSE)
head(sam)
## Resample1
## [1,] 5
## [2,] 9
## [3,] 13
## [4,] 17
## [5,] 19
## [6,] 21
train <- exprSet[,sam]
test <- exprSet[,-sam]
train_meta <- meta[sam,]
test_meta <- meta[-sam,]
prop.table(table(train_meta$ajcc_pathologic_stage))
prop.table(table(test_meta$ajcc_pathologic_stage))
prop.table(table(test_meta$race))
prop.table(table(train_meta$race))
5.2 切割后的train數(shù)據(jù)集建模
e=t(train[c("hsa-mir-21","hsa-mir-181a-1","hsa-mir-424",'hsa-mir-192',"hsa-mir-195"),])
e=log2(e)
colnames(e)=c('miR21','miR181a','miR424','miR192','miR195')
dat=cbind(train_meta,e)
dat$gender=factor(dat$gender)
colnames(dat)[8]="Stage"
dat$Stage=factor(dat$Stage)
colnames(dat)
dat$status=ifelse(dat$vital_status=="Alive",1,0)
s=Surv(age_at_diagnosis, status) ~ gender + Stage + miR21+miR181a+miR424+miR192+miR195
model <- coxph(s, data = dat )
5.3 模型可視化
options(scipen=1)
ggforest(model, data =dat,
main = "Hazard ratio",
cpositions = c(0.10, 0.22, 0.4),
fontsize = 1.0,
refLabel = "1", noDigits = 4)

建模后
Concordance Index:0.65
C-index用于計算生存分析中的COX模型預測值與真實之間的區(qū)分度(discrimination),也稱為Harrell's concordanceindex。C-index在0.5-1之間。0.5為完全不一致,說明該模型沒有預測作用,1為完全一致,說明該模型預測結果與實際完全一致。
本次建模不太行,但是還是有一定的預測作用。涉及到公式方面真的有點蒙,過程比較復雜不好理解,但是我應用的話知道判斷的標準即可。