跟著一區(qū)文章學(xué)通單細(xì)胞(一):批量讀取合并數(shù)據(jù)、批量質(zhì)控和標(biāo)準(zhǔn)化


前言

生信作曲家一直致力于構(gòu)建國內(nèi)和諧、開放、可持續(xù)的生物信息交流平臺。生信作曲家的目標(biāo)是為每個科研人掃清科研路上的一切障礙,讓生物信息學(xué)人人可做。目前,生信作曲家已推出"跟著一區(qū)文章學(xué)通單細(xì)胞"(上輯更新完畢),CIBERSORTx, NTP, TICPE,VIPER算法等多種強(qiáng)力教程,開辦生信中秋分享會、小年夜分享會、暑期集訓(xùn)營和冬季集訓(xùn)營等多次。機(jī)器學(xué)習(xí)從入門到精通等板塊也更新在即。

眾所周知,單細(xì)胞系列教程在全網(wǎng)已經(jīng)很多。相比于其他教程,生信作曲家希望為大家?guī)?strong>基于科研問題的,帶著科研思考的單細(xì)胞轉(zhuǎn)錄組全套教程。這是本教程最具特色的之處,也是為什么你需要學(xué)習(xí)的原因。

本套教程來源于這篇今年2021年12月發(fā)表在Oncogene上的文章。?doi: 10.1038/s41388-021-02054-3。之所以選擇oncogene而不是CNS級別的刊物,是因?yàn)橐幌騺淼膶?shí)踐告訴我們,往往這些口碑較好的一區(qū)文章的思路、技術(shù)、格局,恰恰是我們普通科研人能最大化汲取的。

2022年伊始的第一篇文章,就要帶領(lǐng)大家學(xué)會如何讀入單細(xì)胞數(shù)據(jù),以及預(yù)處理。示例數(shù)據(jù)閱讀原文,密碼pi8t

本節(jié)內(nèi)容模式相對比較固定。大家按部就班跑就可以

代碼實(shí)操

#####################預(yù)處理

#?加載R包
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(ggplot2)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(progeny)
library(readr)
library(stringr)

#設(shè)定閾值
nFeature_lower?<-?500
nFeature_upper?<-?10000
nCount_lower?<-?1000
nCount_upper?<-?100000
pMT_lower?<-?0
pMT_upper?<-?30
pHB_lower?<-?0
pHB_upper?<-?5

theme_set(theme_cowplot())

#設(shè)定配色方案
use_colors?<-?c(
??Tumor?=?"brown2",
??Normal?=?"deepskyblue2",
??G1?=?"#46ACC8",
??G2M?=?"#E58601",
??S?=?"#B40F20",
??Epithelial?=?"seagreen",
??Immune?=?"darkgoldenrod2",
??Stromal?=?"steelblue",
??p018?=?"#E2D200",
??p019?=?"#46ACC8",
??p023?=?"#E58601",
??p024?=?"#B40F20",
??p027?=?"#0B775E",
??p028?=?"#E1BD6D",
??p029?=?"#35274A",
??p030?=?"#F2300F",
??p031?=?"#7294D4",
??p032?=?"#5B1A18",
??p033?=?"#9C964A",
??p034?=?"#FD6467")

#?加載數(shù)據(jù)和質(zhì)量控制

###工作目錄根據(jù)自己的文件夾做一定調(diào)整,例如../,./data/
###?樣本列表,利用read_excel
samples?<-?read_excel("./metadata/patients_metadata.xlsx",?range?=?cell_cols("A:A"))?%>%?.$sample_id

###?批量載入標(biāo)準(zhǔn)的cellranger文件,閱讀原文獲取數(shù)據(jù),為了大家運(yùn)行便利,我們讀6個腺癌進(jìn)來,3正常,3腫瘤
###?為什么我知道是腺癌請看metadata.excel文件
###?15:20用于挑選出sample的范圍
###?使用paste0函數(shù)進(jìn)行準(zhǔn)確定位到某個文件夾
###?注意read10X的三聯(lián)文件,matrix,features,barcodes
###?cellranger為公司返還的一般形式,這里對完全0基礎(chǔ)的同學(xué)而言是最大的理解障礙
for?(i?in?seq_along(samples)[15:20]){
??assign(paste0("scs_data",?i),?Read10X(data.dir?=?paste0("./cellranger/",?samples[i],?"/filtered_feature_bc_matrix")))
}

###?創(chuàng)建seurat對象,也是創(chuàng)建6個對象
for?(i?in?seq_along(samples)[15:20]){
??assign(paste0("seu_obj",?i),?CreateSeuratObject(counts?=?eval(parse(text?=?paste0("scs_data",?i))),?project?=?samples[i],?min.cells?=?3))
}

###?merge一鍵融合,從15-20個樣本
seu_obj?<-?merge(seu_obj15,?y?=?c(seu_obj16,?seu_obj17,?seu_obj18,?seu_obj19,?seu_obj20),?add.cell.ids?=?samples[15:20],?project?=?"lung")

###?計(jì)算線粒體等基因的比例
seu_obj?<-?PercentageFeatureSet(seu_obj,?pattern?=?"^MT-",?col.name?=?"pMT")
seu_obj?<-?PercentageFeatureSet(seu_obj,?pattern?=?"^HBA|^HBB",?col.name?=?"pHB")
seu_obj?<-?PercentageFeatureSet(seu_obj,?pattern?=?"^RPS|^RPL",?col.name?=?"pRP")

qcparams?<-?c("nFeature_RNA",?"nCount_RNA",?"pMT",?"pHB",?"pRP")
for?(i?in?seq_along(qcparams)){
??print(VlnPlot(object?=?seu_obj,?features?=?qcparams[i],?group.by?=?"orig.ident",?pt.size?=?0))
}
for?(i?in?seq_along(qcparams)){
??print(RidgePlot(object?=?seu_obj,?features?=?qcparams[i],?group.by?=?"orig.ident"))
}
###?批量看計(jì)算結(jié)果,很不錯
VlnPlot(seu_obj,?features?=?c("nFeature_RNA",?"nCount_RNA",?"pMT"),?pt.size?=?0,?group.by?=?"orig.ident",?ncol?=?1,?log?=?T)
ggsave2("QC.pdf",?path?=?"./result_3v3/",?width?=?20,?height?=?20,?units?=?"cm")
###?清空環(huán)境

remove(seu_obj15)
remove(seu_obj16)
remove(seu_obj17)
remove(seu_obj18)
remove(seu_obj19)
remove(seu_obj20)


remove(scs_data15)
remove(scs_data16)
remove(scs_data17)
remove(scs_data18)
remove(scs_data19)
remove(scs_data20)


#?過濾
#?定義兩個過濾作圖函數(shù)
qc_std_plot_helper?<-?function(x)?x?+?
??scale_color_viridis()?+
??geom_point(size?=?0.01,?alpha?=?0.3)

qc_std_plot?<-?function(seu_obj)?{
??qc_data?<-?as_tibble(FetchData(seu_obj,?c("nCount_RNA",?"nFeature_RNA",?"pMT",?"pHB",?"pRP")))
??plot_grid(

????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?log2(nFeature_RNA),?color?=?pMT)))?+?
??????geom_hline(yintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?log2(nFeature_RNA),?color?=?pHB)))?+?
??????geom_hline(yintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?log2(nFeature_RNA),?color?=?pRP)))?+?
??????geom_hline(yintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),

????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?pMT,?color?=?nFeature_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?pHB,?color?=?nFeature_RNA)))?+?
??????geom_hline(yintercept?=?pHB_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pHB_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?pRP,?color?=?nFeature_RNA)))?+?
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),


????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nFeature_RNA),?pMT,?color?=?nCount_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nFeature_RNA),?pHB,?color?=?nCount_RNA)))?+?
??????geom_hline(yintercept?=?pHB_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pHB_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nFeature_RNA),?pRP,?color?=?nCount_RNA)))?+?
??????geom_vline(xintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2),

????qc_std_plot_helper(ggplot(qc_data,?aes(pRP,?pMT,?color?=?nCount_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(pRP,?pMT,?color?=?nFeature_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2),


????ggplot(gather(qc_data,?key,?value),?aes(key,?value))?+
??????geom_violin()?+
??????facet_wrap(~key,?scales?=?"free",?ncol?=?5),

????ncol?=?3,?align?=?"hv"
??)
}


##?過濾前

seu_obj_unfiltered?<-?seu_obj

#?作圖,有點(diǎn)慢,但是一般可以出來
qc_std_plot(seu_obj_unfiltered)

ggsave2("before_filtering.pdf",?path?=?"./result_3v3/",?width?=?30,?height?=?30,?units?=?"cm")
##?過濾
seu_obj?<-?subset(seu_obj_unfiltered,?subset?=?nFeature_RNA?>?nFeature_lower?&?nFeature_RNA?<?nFeature_upper?&?nCount_RNA?>?nCount_lower?&?nCount_RNA?<?nCount_upper?&?pMT?<?pMT_upper?&?pHB?<?pHB_upper)

#有點(diǎn)慢,同樣可以出
qc_std_plot(seu_obj)
ggsave2("after_filtering.pdf",?path?=?"./result_3v3/",?width?=?30,?height?=?30,?units?=?"cm")

#?看過濾了多少
seu_obj_unfiltered
seu_obj
#####################?SCTransform提供了去除深度的策略###############################
#利用seurat的SCTransform消除深度差異,非常推薦的標(biāo)準(zhǔn)化的方法,之后保存下次用!
seu_obj?<-?SCTransform(seu_obj,?verbose?=?T,?vars.to.regress?=?c("nCount_RNA",?"pMT"),?conserve.memory?=?T)

saveRDS(seu_obj,?file?=?"scRNA_SCTransform.RDS")

后記

關(guān)注 生信作曲家 wx,學(xué)習(xí)更多

本文使用 文章同步助手 同步

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容