第一部分 背景介紹
為了方便起見,直接選用airway的數(shù)據(jù)作為訓(xùn)練數(shù)據(jù)
library(airway)
library(edgeR)
library(limma)
airway
#class: RangedSummarizedExperiment
#dim: 64102 8
#metadata(1): ''
#assays(1): counts
#rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
#rowData names(0):
#colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
#colData names(9): SampleName cell ... Sample BioSample
##airway是一個(gè)`RangedSummarizedExperiment`對(duì)象,我們可以直接用`colData()`提取它的樣本信息
colData(airway)
#DataFrame with 8 rows and 9 columns
# SampleName cell dex albut Run avgLength Experiment Sample BioSample
# <factor> <factor> <factor> <factor> <factor> <integer> <factor> <factor> <factor>
#SRR1039508 GSM1275862 N61311 untrt untrt SRR1039508 126 SRX384345 SRS508568 SAMN02422669
#SRR1039509 GSM1275863 N61311 trt untrt SRR1039509 126 SRX384346 SRS508567 SAMN02422675
#SRR1039512 GSM1275866 N052611 untrt untrt SRR1039512 126 SRX384349 SRS508571 SAMN02422678
#SRR1039513 GSM1275867 N052611 trt untrt SRR1039513 87 SRX384350 SRS508572 SAMN02422670
#SRR1039516 GSM1275870 N080611 untrt untrt SRR1039516 120 SRX384353 SRS508575 SAMN02422682
#SRR1039517 GSM1275871 N080611 trt untrt SRR1039517 126 SRX384354 SRS508576 SAMN02422673
#SRR1039520 GSM1275874 N061011 untrt untrt SRR1039520 101 SRX384357 SRS508579 SAMN02422683
#SRR1039521 GSM1275875 N061011 trt untrt SRR1039521 98 SRX384358 SRS508580 SAMN02422677
##可以看到,第三列就是我們想要的表型信息,于是我們提取這個(gè)表型信息
pheno <- colData(airway)[,3]
##之后用assay()提取表達(dá)矩陣
airway <- assay(airway)
head(airway)
# SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 SRR1039521
#ENSG00000000003 679 448 873 408 1138 1047 770 572
#ENSG00000000005 0 0 0 0 0 0 0 0
#ENSG00000000419 467 515 621 365 587 799 417 508
#ENSG00000000457 260 211 263 164 245 331 233 229
#ENSG00000000460 60 55 40 35 78 63 76 60
#ENSG00000000938 0 0 2 0 1 0 0 0
基因注釋
##上一步我們就得到了它的表達(dá)矩陣以及表型信息,之后將對(duì)這些基因進(jìn)行注釋,可以看到,表達(dá)矩陣的基因都是ensembl形式的,我們要找到它的`external_gene_name`,`entrezgene`,`chromosome_name`,`gene_biotype`等注釋信息已進(jìn)行后續(xù)操作
library(biomaRt)
gene.names <- rownames(airway)
ensembl <- useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset="hsapiens_gene_ensembl",
host="aug2017.archive.ensembl.org") # Ensembl version 90.
ensemblGenes <- getBM(attributes=c('ensembl_gene_id', 'chromosome_name', 'gene_biotype',
'external_gene_name', 'entrezgene'), filters='ensembl_gene_id',
values=gene.names, mart=ensembl)
head(ensemblGenes)
#ensembl_gene_id chromosome_name gene_biotype external_gene_name entrezgene
#1 ENSG00000000003 X protein_coding TSPAN6 7105
#2 ENSG00000000005 X protein_coding TNMD 64102
#3 ENSG00000000419 20 protein_coding DPM1 8813
#4 ENSG00000000457 1 protein_coding SCYL3 57147
#5 ENSG00000000460 1 protein_coding C1orf112 55732
#6 ENSG00000000938 1 protein_coding FGR 2268
features <- ensemblGenes[match(gene.names, ensemblGenes$ensembl_gene_id),]
features$ensembl_gene_id <- gene.names
features$entrezgene <- gsub(" ", "", as.character(features$entrezgene))
row.names(features) <- gene.names
head(features)
# ensembl_gene_id chromosome_name gene_biotype external_gene_name entrezgene
#ENSG00000000003 ENSG00000000003 X protein_coding TSPAN6 7105
#ENSG00000000005 ENSG00000000005 X protein_coding TNMD 64102
#ENSG00000000419 ENSG00000000419 20 protein_coding DPM1 8813
#ENSG00000000457 ENSG00000000457 1 protein_coding SCYL3 57147
#ENSG00000000460 ENSG00000000460 1 protein_coding C1orf112 55732
#ENSG00000000938 ENSG00000000938 1 protein_coding FGR 2268
第二部分 得到DGEList,并對(duì)其進(jìn)行操作
y = DGEList(airway)
y$samples$group <- pheno
y$features = features
##這樣y中就包含了樣本信息以及基因的信息
library(scater)
new.row.names <- uniquifyFeatureNames(
features$ensembl_gene_id,
features$external_gene_name
)
rownames(y) <- new.row.names
head(rownames(y), 20)
#[1] "TSPAN6" "TNMD" "DPM1" "SCYL3" "C1orf112" "FGR" "CFH" "FUCA2" "GCLC" "NFYA" "STPG1"
#[12] "NIPAL3" "LAS1L" "ENPP4" "SEMA3F" "CFTR" "ANKIB1" "CYP51A1" "KRIT1" "RAD52"
使用symbol_id為y命名,并對(duì)重復(fù)的symbol_id與ensembl_id結(jié)合以防止重復(fù)id
uniquifyFeatureNames(
ID=paste0("ENSG0000000", 1:5),
names=c("A", NA, "B", "C", "A")
)
# "A_ENSG00000001" "ENSG00000002" "B" "C" "A_ENSG00000005"
第三部分 數(shù)據(jù)預(yù)處理
原始數(shù)據(jù)的轉(zhuǎn)換
進(jìn)行后續(xù)處理一般都不會(huì)用raw counts的,因?yàn)榇嬖跍y(cè)序深度、文庫(kù)大小的差別,這樣的結(jié)果是不準(zhǔn)確的
一般的做法是:利用標(biāo)準(zhǔn)化算法,如CPM(counts per million), log-CPM (log2-counts per million), RPKM (reads per kilobase of transcript per million), FPKM(fragments per kilobase oftranscript per million)等去除文庫(kù)大小、深度的影響。和RPKM、FPKM不同的是,CPM和log-CPM不需要考慮feature length的差異,也就是說基因長(zhǎng)度在統(tǒng)計(jì)時(shí)被當(dāng)成常數(shù),只考慮不同處理下的不同,而不會(huì)受長(zhǎng)度的影響
cpm使用cpm()函數(shù);RPKM使用rpkm函數(shù),都屬于edegR
cpm <- cpm(y)
lcpm <- cpm(y, log=TRUE)
預(yù)篩選基因
所有數(shù)據(jù)集都會(huì)存在一些檢測(cè)不到的基因或者表達(dá)很少的基因,這些基因?qū)τ诤罄m(xù)分析來說不僅對(duì)結(jié)果沒有影響,還會(huì)增加下游分析的計(jì)算量,所以我們需要設(shè)計(jì)一定的閾值將這些基因去除
首先看一下表達(dá)量為0的基因
table(rowSums(y$counts==0)==6)
#FALSE TRUE
#61682 2420
##在所有樣本中表達(dá)量都為0的基因共有2420個(gè)
過濾基因:標(biāo)準(zhǔn)就是lcpm在所有樣本表達(dá)量的平均值大于0,也就是cpm大于1
keep <- rowMeans(lcpm>0)>0
y <- y[keep ,, keep.lib.sizes=FALSE]
dim(y)
#[1] 15806 8
數(shù)據(jù)標(biāo)準(zhǔn)化
對(duì)于edgeR來說,進(jìn)行差異表達(dá)時(shí)為什么需要counts矩陣呢,因?yàn)?code>edgeR有自己的標(biāo)準(zhǔn)化步驟,edgeR使用TMM(trimmed mean of M-values)算法,利用edgeR中函數(shù)calNormFactors()
標(biāo)準(zhǔn)化用到的normalisation factors 就在DEGList中的x$samples$norm.factors
y <- calcNormFactors(y, method = "TMM")
y$samples$norm.factors
#[1] 1.0550718 1.0217435 0.9897916 0.9498495 1.0307290 0.9785444 1.0258677 0.9535888
如何檢查是否標(biāo)準(zhǔn)化,可以做一個(gè)箱線圖顯示
#模擬一個(gè)數(shù)據(jù)
x2 <- y
x2$samples$norm.factors <- 1
x2$counts[,1] <- ceiling(x2$counts[,1]*0.05) #第一個(gè)樣本count縮小到原來5%
x2$counts[,2] <- x2$counts[,2]*5 # 第二個(gè)樣本擴(kuò)大到原來5倍
#畫圖
par(mfrow=c(1,2))
lcpm <- cpm(x2, log=TRUE)
boxplot(lcpm, las=2, col=col, main="")
title(main="A. Example: Unnormalised data",ylab="Log-cpm")
x2 <- calcNormFactors(x2)
x2$samples$norm.factors
#[1] 0.06578336 6.03546029 1.16646293 1.12351485 1.21732873 1.15208812 1.21158739 1.13103354
lcpm <- cpm(x2, log=TRUE)
boxplot(lcpm, las=2, col=col, main="")
title(main="B. Example: Normalised data",ylab="Log-cpm")
