DESeq2的基本使用

DEseq預(yù)熱

主要就是這幾個(gè)步驟了。

#準(zhǔn)備count tables
cnts <- matrix(rnbinom(n=1000, mu=100, size=1/0.5),ncol = 10) #構(gòu)建滿(mǎn)足隨機(jī)負(fù)二項(xiàng)分布的一組數(shù)
cond <- factor(rep(1:2,each=5))

#創(chuàng)建DEseqDataSet
dds2 <- DESeqDataSetFromMatrix(cnts,DataFrame(cond), ~cond)

#標(biāo)準(zhǔn)分析
dds2 <- DESeq(dds2)
res <- results(dds2)
##另一種檢驗(yàn),LRT,似然率檢驗(yàn)
#ddsLRT <- DESeq(dds2,test="LRT",reduced=~1)
#resLRT <- results(ddsLRT)

#moderate lfc
resultsNames(dds2)
resLFC <- lfcShrink(dds2,coef = 2,type = "apeglm")

#提取結(jié)果
summary()、subset()

1. counts: 查看每個(gè)樣本在每個(gè)基因上的counts數(shù)

library(DESeq2)
> dds <- makeExampleDESeqDataSet(m=4)
> head(counts(dds))
      sample1 sample2 sample3 sample4
gene1       0       0       2       1
gene2      14      56      50      47
gene3       5       3       3       8
gene4      70     105     116     125
gene5       5       2       2      12
gene6     153      70      70      53
> dds1 <- estimateSizeFactors(dds)
> head(counts(dds1,normalized=TRUE)) #通過(guò)估計(jì)size對(duì)counts數(shù)進(jìn)行標(biāo)準(zhǔn)化
         sample1   sample2    sample3     sample4
gene1   0.000000  0.000000   1.883002   0.9839948
gene2  14.041418 53.289007  47.075041  46.2477573
gene3   5.014792  2.854768   2.824502   7.8719587
gene4  70.207091 99.916889 109.214095 122.9993545
gene5   5.014792  1.903179   1.883002  11.8079380
gene6 153.452641 66.611259  65.905057  52.1517263

2. 構(gòu)建DESeqDataSet

> #from matrix:從數(shù)值矩陣得到,比如用featurecount統(tǒng)計(jì)得到的數(shù)值矩陣
> a <- as.matrix(read.table("6sample_count_matrix.txt",sep="\t",header = T,row.names=1))
> head(a)
           WT1  WT2  WT3 acc1.51 acc1.52 acc1.53
AT1G01010  226  183  247     514     353     383
AT1G01020  386  364  453     369     483     443
AT1G01030   77   65   87      52      62      74
AT1G01040 1322 1258 1621    1333    1526    1641
AT1G01050 1531 1518 2029    1573    1888    1987
AT1G01060  124  151  823     146     197     297
> b <- read.table("coldata.txt",header = T,row.names = 1)
> b
        condition
WT1            WT
WT2            WT
WT3            WT
acc1.51    acc1.5
acc1.52    acc1.5
acc1.53    acc1.5
> all(rownames(b) == colnames(a)) #make sure that it's true
[1] TRUE
> dds3 <- DESeqDataSetFromMatrix(countData = a,
+                                DataFrame(b),
+                                design = ~ condition)
> #from htseq-count:由單個(gè)htseq-count結(jié)果文件合并得到
> c <- read.table("sampletable_for_htseq.txt.txt",sep = "\t", header = F)
> c
       V1                V2     V3
1     WT1 SRR3286802.count2     WT
2     WT2 SRR3286803.count2     WT
3     WT3 SRR3286804.count2     WT
4 acc1.51 SRR3286805.count2 acc1.5
5 acc1.52 SRR3286806.count2 acc1.5
6 acc1.53 SRR3286807.count2 acc1.5
> colnames(c) <- c("samplename","filename","condition")
> dds4 <- DESeqDataSetFromHTSeqCount(c, directory = ".", design = ~condition)

3. run DESeq() & results()——主要分析步驟

#是否需要初步過(guò)濾一下
#keep <- rowSums(counts(dds4)) >= 20 # 對(duì)counts數(shù)進(jìn)行初步的過(guò)濾
#dds4 <- dds4[keep,]
> dds4 <- DESeq(dds4) #三步走:文庫(kù)大小估計(jì);離散程度估計(jì);統(tǒng)計(jì)檢驗(yàn)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
> res4 <- results(dds4) #結(jié)果呈現(xiàn)
> res4
log2 fold change (MLE): condition WT vs acc1.5 
Wald test p-value: condition WT vs acc1.5 
DataFrame with 37884 rows and 6 columns
                  baseMean      log2FoldChange             lfcSE               stat               pvalue                padj
                 <numeric>           <numeric>         <numeric>          <numeric>            <numeric>           <numeric>
AT1G01010  308.89691311743  -0.873161860576389 0.235593150198139  -3.70622770586514 0.000210369178701959 0.00230248659971188
AT1G01020 392.486827906655  -0.014533822556904 0.140844859602549 -0.103190294611512     0.91781194256941    0.95926533475846
AT1G01030 63.2288140763626   0.399739741186851 0.255254339106486   1.56604484212152    0.117338119589006   0.275094993904629
......

4. LFC校正

resultsNames(dds4) #顯示誰(shuí)和誰(shuí)比較
res.shr <- lfcShrink(dds4,coef = 2) #2表示第2列,即LFC這一列
res.shr
res.shr2 <- lfcShrink(dds4,contrast = c("condition","acc1.5","WT")) #contrast和coef不能同時(shí)存在
res.shr2
#更換校正模型
res.ape <- lfcShrink(dds4,coef = 2, type = "apeglm")
res.ash <- lfcShrink(dds4, coef = 2, type = "ashr")
res.ash2 <- lfcShrink(dds4, contrast = c("condition","acc1.5","WT"), type = "ashr")

#這一步只會(huì)對(duì)LFC的值產(chǎn)生影響,p值是沒(méi)有改變的

5. 簡(jiǎn)單作圖

plotCounts: 用以檢驗(yàn)單個(gè)基因在組間的reads數(shù)有多少
plotCounts(dds4,"AT1G01010")
plotMA: 所有樣本normalized counts的平均值和LFC的關(guān)系

從整體上來(lái)看gene的上調(diào)下調(diào)情況,注意用resultsNames()看一下是誰(shuí)和誰(shuí)比較

res.shr2 <- lfcShrink(dds4,coef = 2,type = "apeglm") #校正LFC
plotMA(res.shr2,alpha=0.01,main="apeglm") #alpha=0.01,定義顯著水平

6. results()——從DESeq分析中提取結(jié)果

> res4 <- results(dds4,contrast = c("condition","acc1.5","WT"),alpha = 0.001,tidy = F)
> # dds4經(jīng)過(guò)了DESeq()分析
> # contrast: 定義誰(shuí)和誰(shuí)比較
> # lfcThreshold: 定義了lfc閾值,從MA圖中可以看到紅點(diǎn)減少,此外results()結(jié)果中不滿(mǎn)足lfc閾值的gene的p值都是1
> # alpha: 定義顯著水平
> # tidy: 是否將結(jié)果輸出為data.frame格式
> summary(res4,alpha=0.001)

out of 27432 with nonzero total read count
adjusted p-value < 0.001
LFC > 0 (up)       : 1198, 4.4%
LFC < 0 (down)     : 554, 2%
outliers [1]       : 63, 0.23%
low counts [2]     : 6254, 23%
(mean count < 7)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results

> # res4經(jīng)過(guò)了results()操作
> # alpha: 校正p值的閾值,如果沒(méi)有設(shè)置,會(huì)沿用results()中的alpha參數(shù)

#按照自定義的閾值提取差異基因并導(dǎo)出
#diff_gene_deseq2 <-subset(res, padj < 0.001 & abs(log2FoldChange) > 1)
#write.csv(diff_gene_deseq2,file= "DEG_acc1.5_vs_WT.csv")
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容