UCSC Xena 瀏覽器 分類打包,直接下載
gdcRNAtools基于gdc-client下載并簡化整理,用R語言完成
備份的TCGA數據來源于xena,ucsc的,都在,https://share.weiyun.com/5zLnKmO
需求最大的是tcga數據庫的生存分析和表達量差異
看看這兩個視頻:
https://www.bilibili.com/video/av25643438?p=9
https://www.bilibili.com/video/av49363776?p=6
(from 曾老師)
1.xena
# One-off download of the TCGA-CHOL counts, phenotype and survival files from
# the GDC Xena hub. Disabled by default: change FALSE to TRUE to fetch the
# files (destfile paths are relative, so they land in the working directory).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned; mode = "wb" keeps the gzip archives byte-exact on every platform.
if (FALSE) {
  download.file(
    url = "https://gdc.xenahubs.net/download/TCGA-CHOL.htseq_counts.tsv.gz",
    destfile = "counts.tsv.gz", mode = "wb"
  )
  download.file(
    url = "https://gdc.xenahubs.net/download/TCGA-CHOL.GDC_phenotype.tsv.gz",
    destfile = "phenotype.tsv.gz", mode = "wb"
  )
  download.file(
    url = "https://gdc.xenahubs.net/download/TCGA-CHOL.survival.tsv.gz",
    destfile = "survival.tsv.gz", mode = "wb"
  )
}
# Read the count table: the first column becomes the row names and
# check.names = FALSE keeps the column headers exactly as written in the file.
dat <- read.table("counts.tsv.gz", check.names = FALSE, row.names = 1, header = TRUE)
# Reverse the log transform: 2^x - 1 is the inverse of log2(x + 1).
dat <- as.matrix(2^dat - 1)
dat[1:4, 1:4]
as.character(dat[1:100, 1:10]) # some values are not whole numbers
# Convert to an integer matrix. Round first: as.integer() truncates toward
# zero, so a back-transformed value such as 2.9999999 would silently become 2.
# Changing the storage mode in place keeps both row and column names, so the
# row names no longer have to be restored afterwards.
exp <- round(dat)
storage.mode(exp) <- "integer"
exp[1:4, 1:4]
# Phenotype table is tab-separated; fill = TRUE pads rows that have fewer fields.
clinical <- read.table("phenotype.tsv.gz", fill = TRUE, header = TRUE, sep = "\t")
surv <- read.table("survival.tsv.gz", header = TRUE)
# Quick look at the top-left corner of each table.
clinical[1:4, 1:4]
surv[1:4, 1:4]
2.GDCRNATools #自制教程,可自學
http://bioconductor.org/packages/devel/bioc/vignettes/GDCRNATools/inst/doc/GDCRNATools.html
其他來源的RNA-seq數據
GEO
library(GEOquery)
# destdir must be a directory path (a logical FALSE would be pasted into the
# download path as "FALSE/..."); "." stores the series files in the working
# directory. getGPL = FALSE skips fetching the platform annotation.
eSet <- getGEO("GSE162550", destdir = ".", getGPL = FALSE)
# Unlike the earlier microarray datasets, this data cannot be loaded directly
# through the R package. Check the supplementary files on the GEO website and
# make sure which data type they actually contain.
# Read the count table (presumably the GEO supplementary file for GSE162550 —
# confirm). Despite the .xls extension it is parsed as tab-separated text.
# The former rm(list = ls()) was dropped: it wipes every object in the user's
# workspace; restart the R session instead if a clean environment is needed.
dat <- read.table("GSE162550_gene_sample_count_with_symbol.xls",
  fill = TRUE, sep = "\t", header = TRUE
)
# Row names must be unique, so count first occurrences (TRUE) versus repeated
# symbols (FALSE).
table(!duplicated(dat$Symbol))
# Row order by total count over columns 4-9, largest first.
# NOTE(review): `o` is only inspected, never applied. If the intent was to keep
# the most highly expressed row per symbol, `dat <- dat[o, ]` is needed before
# the de-duplication below — confirm with the author.
o <- order(rowSums(dat[, 4:9]), decreasing = TRUE)
# R is case-sensitive: the data viewer is View(); view() is not defined unless
# an extra package provides it. Only open the viewer in interactive sessions.
if (interactive()) {
  View(o)
}
dat <- dat[!duplicated(dat$Symbol), ] # keep the first row of each symbol
dat <- dat[dat$Symbol != "---", ] # drop the "---" placeholder symbol
exp <- dat[, 4:9]
rownames(exp) <- dat$Symbol # use the gene symbols as row names