多芯片分析

因為目前合并多個測序、芯片數(shù)據(jù)集這一塊并沒有完全統(tǒng)一的標準,方法大概有五六種。公說公有理婆說婆有理,對于我這樣的新手來說,最簡單的是跟隨頂級文章的文章思路或者分析流程和步驟。于是我選取了一篇歐洲泌尿外科的頂級文章,從這篇文章的補充材料可以看出來:

移除批次效應前

image.png

image.png

image.png

image.png

移除批次效應后

image.png

image.png

image.png

作者將所有的芯片測序數(shù)據(jù)分為affy和illumina兩類;
將affy公司的芯片通過標準的affy公司流程分析及combat;
illumina同樣是通過標準的illumina公司流程分析及combat;
最后將兩類芯片combat到一起,上一步是消除batch effect是不同的cohort,而這一步是affy和illumina兩個平臺的區(qū)別。


image.png

加載包

###step1 加載包################################
# =======================================================

# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# 
# BiocManager::install("bladderbatch")

rm(list=ls()) 

library("sva")

options(stringsAsFactors=FALSE)

setwd('D:\\train\\data')

library(bladderbatch)

加載用于分析的數(shù)據(jù)

# =======================================================
###step2 加載用于分析的數(shù)據(jù)################################
#包括一個表達量矩陣和一個分組文件
# =======================================================
data(bladderdata)

edata = exprs(bladderEset)


edata[1:5,1:5]

pheno = pData(bladderEset)

pheno[1:5,1:4]

dt <- data.frame(cel=rownames(pheno), pheno)

dt[1:5,1:5]

繪制圖片顯示以前聚類效果

# =======================================================
###step3 繪制圖片顯示合并以前的聚類結果##################
# =======================================================

PCA_raw <- prcomp(t(edata), scale = FALSE)

dataGG <- data.frame(PC1 = PCA_raw$x[,1], PC2 = PCA_raw$x[,2],
                     Disease = dt$cancer,
                     batch = dt$batch)

dataGG$batch <- as.factor(dataGG$batch)


library(ggplot2)

(qplot(PC1, PC2, data = dataGG, color = Disease,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))


library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = batch,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))


boxplot(edata, target = "core",
        main = "Boxplots of log2-intensities for the raw data")


dist_mat <- dist(t(edata))
clustering <- hclust(dist_mat, method = "complete")
par(mfrow=c(2,1))
plot(clustering, labels = pheno$batch)
plot(clustering, labels = pheno$cancer)
image.png

image.png

image.png

image.png

從這些圖片看出有明顯批次效應,批次效應出現(xiàn)在以下情形:

一個實驗不同部分在不同時間完成;
一個實驗不同部分不同人完成;
試劑用量、芯片、實驗儀器不同;
自己的數(shù)據(jù)與網(wǎng)站數(shù)據(jù)混用

Combat參數(shù)(prarametric)或者非參數(shù)(non-prarametric)的經(jīng)驗貝葉斯框架進行批次效應校正。

移除批次效應

# =======================================================
####step4 combat移除批次效應 ############################
# =======================================================

library(bladderbatch)

data(bladderdata)

pheno = pData(bladderEset)
# add fake age variable for numeric
pheno$age = c(1:7, rep(1:10, 5))

# write.table(data.frame(cel=rownames(pheno), pheno),
#             row.names=F, quote=F, sep="\t",
#             file="bladder-pheno.txt")

edata = exprs(bladderEset)

# write.table(edata, row.names=T, 
#             quote=F, sep="\t", file="bladder-expr.txt")
# use dataframe instead of matrix

mod = model.matrix(~as.factor(cancer) + age, data=pheno)

t = Sys.time()

# parametric adjustment

cdata = ComBat(dat=edata, batch=as.factor(pheno$batch), 
               mod=mod,par.prior=TRUE, prior.plots=TRUE)

print(Sys.time() - t)

print(cdata[1:5, 1:5])

# write.table(cdata, "r-batch.txt", sep="\t", quote=F)
image.png

繪制聚類后的圖片

# =======================================================
####step5 繪制合并后聚類圖片############################
# =======================================================

dt <- data.frame(cel=rownames(pheno), pheno)


PCA_raw <- prcomp(t(cdata), scale = FALSE)


dataGG <- data.frame(PC1 = PCA_raw$x[,1], PC2 = PCA_raw$x[,2],
                     Disease = dt$cancer,
                     batch = dt$batch)

dataGG$batch <- as.factor(dataGG$batch)

library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = Disease,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))

library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = batch,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))

boxplot(cdata, target = "core",
        main = "Boxplots of log2-intensities for the raw data")

dist_mat <- dist(t(cdata))
clustering <- hclust(dist_mat, method = "complete")
par(mfrow=c(2,1))
plot(clustering, labels = pheno$batch)
plot(clustering, labels = pheno$cancer)

前后對比,可以發(fā)現(xiàn)不同batch基本不再是涇渭分明,而我們呢關心的cancer和normal涇渭分明了,便于下一步分析。


image.png

image.png

image.png

image.png

實例驗證 數(shù)據(jù)

# =======================================================
####step6 實例驗證 ################################
# =======================================================
rm(list=ls()) 
library(dplyr)
library(tibble)
library(tidyr)
library("sva")
options(stringsAsFactors=FALSE)
setwd('D:\\train\\data\\multi_micro')
data1 <- read.table('GSE68799.txt', header = T, sep = '\t')
data2 <- read.table('GSE102349.txt', header = T, sep = '\t')
data3 <- read.table(file = 'GSE118719.tsv', sep = '\t', header = TRUE)
data1[1:5,1:5]
data2[1:5,1:5]
data3[1:5,1:5]
data1 <- data1 %>%
  tidyr::separate(GeneID, into=c('geneSymbol', 'chr'), sep='\\_')%>%
  dplyr::select(-chr)
data1[1:5,1:5]
data2$geneSymbol <- data2$Gene.Symbol
data2$Gene.Symbol <- NULL
data3$Geneid <- NULL
a <- intersect(data2$geneSymbol, data1$geneSymbol)
a <- intersect(a, data3$geneSymbol)
data1 <- data1[which(data1$geneSymbol %in% a), ]
data2 <- data2[which(data2$geneSymbol %in% a), ]
data3 <- data3[which(data3$geneSymbol %in% a), ]

data1 <- data1 %>%
  distinct(geneSymbol, .keep_all = T)
data2 <- data2 %>%
  distinct(geneSymbol, .keep_all = T)
data3 <- data3 %>%
  distinct(geneSymbol, .keep_all = T)%>%
  column_to_rownames(var = 'geneSymbol')
data3 <- log2(data3 +1)
data3[1:5,1:5]
data3 <- data3 %>%
  rownames_to_column(var = 'geneSymbol')
data <- merge(data1, data2, by='geneSymbol')

data <- merge(data, data3, by='geneSymbol')%>%
  column_to_rownames(var = 'geneSymbol')
data[1:5,1:5]
metadat <- as.data.frame(colnames(data))
# write.csv(metadat, file = 'metadat.csv')

write.csv(data, file = 'data.csv')

# =======================================================
####step7 移除批次效應 ################################
# =======================================================
setwd('D:\\train\\data\\multi_micro')
rm(list=ls()) 
library("sva")
options(stringsAsFactors=FALSE)
library(bladderbatch)
data(bladderdata)
edata  <- read.csv('data.csv', header = T, row.names = 1)
pheno <- read.csv('metadat.csv', header = T)
dt <- pheno
PCA_raw <- prcomp(t(edata), scale = FALSE)
dataGG <- data.frame(PC1 = PCA_raw$x[,1], PC2 = PCA_raw$x[,2],
                     Disease = dt$cancer,
                     batch = dt$batch)

dataGG$batch <- as.factor(dataGG$batch)

library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = Disease,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))

library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = batch,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))

dist_mat <- dist(t(edata))
clustering <- hclust(dist_mat, method = "complete")
par(mfrow=c(2,1))
plot(clustering, labels = pheno$batch)
plot(clustering, labels = pheno$cancer)


#再做一個分組列,用于批次效應中排除項。
pheno$hasCancer <- as.numeric(pheno$cancer == "T")  
#分組模型
model <- model.matrix(~hasCancer, data = pheno)
t = Sys.time()
cdata <- ComBat(dat = as.matrix(edata),
                       batch = pheno$batch, mod = model)

print(Sys.time() - t)

print(cdata[1:5, 1:5])

PCA_raw <- prcomp(t(cdata), scale = FALSE)

dataGG <- data.frame(PC1 = PCA_raw$x[,1], PC2 = PCA_raw$x[,2],
                     Disease = dt$cancer,
                     batch = dt$batch)

dataGG$batch <- as.factor(dataGG$batch)
library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = Disease,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))

library(ggplot2)
(qplot(PC1, PC2, data = dataGG, color = batch,
       main = "PCA plot of the raw data (log-transformed)", size = I(2),
       asp = 1.0)
  + scale_colour_brewer(palette = "Set2"))

dist_mat <- dist(t(cdata))
clustering <- hclust(dist_mat, method = "complete")
par(mfrow=c(2,1))
plot(clustering, labels = pheno$batch)
plot(clustering, labels = pheno$cancer)

question:
同屬于illumina平臺的batch1和batch2的移除批次效應結果很滿意,但是affy效果一般;
1.可能是affy數(shù)據(jù)表達矩陣為原始數(shù)據(jù),而illumina為標準化后的數(shù)據(jù),所以差距大;
2.這篇頂刊提示我么為什么對affy標準化后才和illumina合并

最后編輯于
?著作權歸作者所有,轉載或內容合作請聯(lián)系作者
【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。
禁止轉載,如需轉載請通過簡信或評論聯(lián)系作者。

相關閱讀更多精彩內容

友情鏈接更多精彩內容