加載到表達(dá)矩陣,分組,生存情況
rm(list=ls())
load(file = 'Rdata/TCGA_KIRC_mut.Rdata')
load(file = 'Rdata/TCGA-KIRC-miRNA-example.Rdata')
group_list=ifelse(as.numeric(substr(colnames(expr),14,15)) < 10,'tumor','normal')
table(group_list)
load(file='Rdata/survival_input.Rdata')
head(phe)
exprSet[1:4,1:4]
切割數(shù)據(jù)組
方法一:完全隨機(jī)切割
dim(expr)
set.seed(1234567890)#設(shè)定一個(gè)數(shù)值,使得隨機(jī)數(shù)不會(huì)每次都變化
#sample(1:10,3)
k=sample(1:593,300)#在600個(gè)數(shù)字中隨機(jī)選300個(gè)
t_exp=expr[,k] ##
v_exp=expr[,-k]##
#把數(shù)據(jù)分成了兩組
table(ifelse(substr(colnames(t_exp),14,15)=='01','tumor','normal'))
table(ifelse(substr(colnames(v_exp),14,15)=='01','tumor','normal'))
t_tumor=t_exp[,substr(colnames(t_exp),14,15)=='01']
v_tumor=t_exp[,substr(colnames(v_exp),14,15)=='01']
t_phe= phe[match(substr(colnames(t_tumor),1,12),phe$ID),]
v_phe=phe[match(substr(colnames(v_tumor),1,12),phe$ID),]
table(t_phe$stage)
table(v_phe$stage)
方法二:recipes包,切割數(shù)據(jù),可以保證分組平衡(一般用這個(gè)方法比較好)
#install.packages("recipes")
if(F){
## 切割數(shù)據(jù),可以平衡
library(caret)
set.seed(123456789)
sam<- createDataPartition(phe$event, p = .5,list = FALSE)
train <- phe[sam,]
test <- phe[-sam,]
#查看兩組一些臨床參數(shù)切割比例
prop.table(table(train$stage))
prop.table(table(test$stage))
}
t_exp= expr[,match(train$ID,substr(colnames(expr),1,12))]
v_exp=expr[,-match(train$ID,substr(colnames(expr),1,12))]
#match返回X第一次在Y種的位置
#v_exp=expr[,-which(colnames(expr)%in%train$ID)]##找到位置之后刪除
#把數(shù)據(jù)分成了兩組
table(ifelse(substr(colnames(t_exp),14,15)=='01','tumor','normal'))
table(ifelse(substr(colnames(v_exp),14,15)=='01','tumor','normal'))
t_tumor=t_exp[,substr(colnames(t_exp),14,15)=='01']
v_tumor=v_exp[,substr(colnames(v_exp),14,15)=='01']
t_phe= phe[match(substr(colnames(t_tumor),1,12),phe$ID),]
v_phe=phe[match(substr(colnames(v_tumor),1,12),phe$ID),]
table(t_phe$stage)
table(v_phe$stage)
head(t_phe)
t_tumor[1:4,1:4]
最后:進(jìn)行了lars回歸,把數(shù)據(jù)分成兩部分,一部分算,一部分用來驗(yàn)證
library(lars)
library(glmnet)
x=t(log2(t_tumor+1))
y=t_phe$event
cv_fit <- cv.glmnet(x=x, y=y, alpha = 1, nlambda = 1000)
#進(jìn)行了一個(gè)lars回歸,X是表達(dá)數(shù)據(jù),Y是生存情況
plot.cv.glmnet(cv_fit)
# 兩條虛線分別指示了兩個(gè)特殊的λ值:
c(cv_fit$lambda.min,cv_fit$lambda.1se)
model_lasso <- glmnet(x=x, y=y, alpha = 1, lambda=cv_fit$lambda.1se)
#把預(yù)測(cè)和這個(gè)數(shù)據(jù)算出的結(jié)果相比較,看看預(yù)測(cè)準(zhǔn)確度
lasso.prob <- predict(cv_fit, newx=t(log2(v_tumor+1)) , s=c(cv_fit$lambda.min,cv_fit$lambda.1se) )
re=cbind(v_phe$event ,lasso.prob)
## https://vip.biotrainee.com/d/812-
library(ROCR)
library(glmnet)
library(caret)
# calculate probabilities for TPR/FPR for predictions
pred <- prediction(re[,2], re[,1])
perf <- performance(pred,"tpr","fpr")
performance(pred,"auc") # shows calculated AUC for model
plot(perf,colorize=FALSE, col="black") # plot ROC curve
lines(c(0,1),c(0,1),col = "gray", lty = 4 )
#把數(shù)據(jù)分成兩部分,一部分算,一部分用來驗(yàn)證
最后
感謝jimmy的生信技能樹團(tuán)隊(duì)!
感謝導(dǎo)師岑洪老師!
感謝健明、孫小潔,慧美等生信技能樹團(tuán)隊(duì)的老師一路以來的指導(dǎo)和鼓勵(lì)!
文中代碼來自生信技能樹jimmy老師!