
前言
生信作曲家一直致力于構(gòu)建國內(nèi)和諧、開放、可持續(xù)的生物信息交流平臺。生信作曲家的目標(biāo)是為每個科研人掃清科研路上的一切障礙,讓生物信息學(xué)人人可做。目前,生信作曲家已推出"跟著一區(qū)文章學(xué)通單細(xì)胞"(上輯更新完畢),CIBERSORTx, NTP, TICPE,VIPER算法等多種強(qiáng)力教程,開辦生信中秋分享會、小年夜分享會、暑期集訓(xùn)營和冬季集訓(xùn)營等多次。機(jī)器學(xué)習(xí)從入門到精通等板塊也更新在即。
眾所周知,單細(xì)胞系列教程在全網(wǎng)已經(jīng)很多。相比于其他教程,生信作曲家希望為大家?guī)?strong>基于科研問題的,帶著科研思考的單細(xì)胞轉(zhuǎn)錄組全套教程。這是本教程最具特色的之處,也是為什么你需要學(xué)習(xí)的原因。
本套教程來源于這篇今年2021年12月發(fā)表在Oncogene上的文章。?doi: 10.1038/s41388-021-02054-3。之所以選擇oncogene而不是CNS級別的刊物,是因?yàn)橐幌騺淼膶?shí)踐告訴我們,往往這些口碑較好的一區(qū)文章的思路、技術(shù)、格局,恰恰是我們普通科研人能最大化汲取的。
2022年伊始的第一篇文章,就要帶領(lǐng)大家學(xué)會如何讀入單細(xì)胞數(shù)據(jù),以及預(yù)處理。示例數(shù)據(jù)閱讀原文,密碼pi8t
本節(jié)內(nèi)容模式相對比較固定。大家按部就班跑就可以
代碼實(shí)操
#####################預(yù)處理
#?加載R包
library(Seurat)
library(dplyr)
library(reticulate)
library(sctransform)
library(cowplot)
library(ggplot2)
library(viridis)
library(tidyr)
library(magrittr)
library(reshape2)
library(readxl)
library(progeny)
library(readr)
library(stringr)
#設(shè)定閾值
nFeature_lower?<-?500
nFeature_upper?<-?10000
nCount_lower?<-?1000
nCount_upper?<-?100000
pMT_lower?<-?0
pMT_upper?<-?30
pHB_lower?<-?0
pHB_upper?<-?5
theme_set(theme_cowplot())
#設(shè)定配色方案
use_colors?<-?c(
??Tumor?=?"brown2",
??Normal?=?"deepskyblue2",
??G1?=?"#46ACC8",
??G2M?=?"#E58601",
??S?=?"#B40F20",
??Epithelial?=?"seagreen",
??Immune?=?"darkgoldenrod2",
??Stromal?=?"steelblue",
??p018?=?"#E2D200",
??p019?=?"#46ACC8",
??p023?=?"#E58601",
??p024?=?"#B40F20",
??p027?=?"#0B775E",
??p028?=?"#E1BD6D",
??p029?=?"#35274A",
??p030?=?"#F2300F",
??p031?=?"#7294D4",
??p032?=?"#5B1A18",
??p033?=?"#9C964A",
??p034?=?"#FD6467")
#?加載數(shù)據(jù)和質(zhì)量控制
###工作目錄根據(jù)自己的文件夾做一定調(diào)整,例如../,./data/
###?樣本列表,利用read_excel
samples?<-?read_excel("./metadata/patients_metadata.xlsx",?range?=?cell_cols("A:A"))?%>%?.$sample_id
###?批量載入標(biāo)準(zhǔn)的cellranger文件,閱讀原文獲取數(shù)據(jù),為了大家運(yùn)行便利,我們讀6個腺癌進(jìn)來,3正常,3腫瘤
###?為什么我知道是腺癌請看metadata.excel文件
###?15:20用于挑選出sample的范圍
###?使用paste0函數(shù)進(jìn)行準(zhǔn)確定位到某個文件夾
###?注意read10X的三聯(lián)文件,matrix,features,barcodes
###?cellranger為公司返還的一般形式,這里對完全0基礎(chǔ)的同學(xué)而言是最大的理解障礙
for?(i?in?seq_along(samples)[15:20]){
??assign(paste0("scs_data",?i),?Read10X(data.dir?=?paste0("./cellranger/",?samples[i],?"/filtered_feature_bc_matrix")))
}
###?創(chuàng)建seurat對象,也是創(chuàng)建6個對象
for?(i?in?seq_along(samples)[15:20]){
??assign(paste0("seu_obj",?i),?CreateSeuratObject(counts?=?eval(parse(text?=?paste0("scs_data",?i))),?project?=?samples[i],?min.cells?=?3))
}
###?merge一鍵融合,從15-20個樣本
seu_obj?<-?merge(seu_obj15,?y?=?c(seu_obj16,?seu_obj17,?seu_obj18,?seu_obj19,?seu_obj20),?add.cell.ids?=?samples[15:20],?project?=?"lung")
###?計(jì)算線粒體等基因的比例
seu_obj?<-?PercentageFeatureSet(seu_obj,?pattern?=?"^MT-",?col.name?=?"pMT")
seu_obj?<-?PercentageFeatureSet(seu_obj,?pattern?=?"^HBA|^HBB",?col.name?=?"pHB")
seu_obj?<-?PercentageFeatureSet(seu_obj,?pattern?=?"^RPS|^RPL",?col.name?=?"pRP")
qcparams?<-?c("nFeature_RNA",?"nCount_RNA",?"pMT",?"pHB",?"pRP")
for?(i?in?seq_along(qcparams)){
??print(VlnPlot(object?=?seu_obj,?features?=?qcparams[i],?group.by?=?"orig.ident",?pt.size?=?0))
}
for?(i?in?seq_along(qcparams)){
??print(RidgePlot(object?=?seu_obj,?features?=?qcparams[i],?group.by?=?"orig.ident"))
}
###?批量看計(jì)算結(jié)果,很不錯
VlnPlot(seu_obj,?features?=?c("nFeature_RNA",?"nCount_RNA",?"pMT"),?pt.size?=?0,?group.by?=?"orig.ident",?ncol?=?1,?log?=?T)
ggsave2("QC.pdf",?path?=?"./result_3v3/",?width?=?20,?height?=?20,?units?=?"cm")###?清空環(huán)境
remove(seu_obj15)
remove(seu_obj16)
remove(seu_obj17)
remove(seu_obj18)
remove(seu_obj19)
remove(seu_obj20)
remove(scs_data15)
remove(scs_data16)
remove(scs_data17)
remove(scs_data18)
remove(scs_data19)
remove(scs_data20)
#?過濾
#?定義兩個過濾作圖函數(shù)
qc_std_plot_helper?<-?function(x)?x?+?
??scale_color_viridis()?+
??geom_point(size?=?0.01,?alpha?=?0.3)
qc_std_plot?<-?function(seu_obj)?{
??qc_data?<-?as_tibble(FetchData(seu_obj,?c("nCount_RNA",?"nFeature_RNA",?"pMT",?"pHB",?"pRP")))
??plot_grid(
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?log2(nFeature_RNA),?color?=?pMT)))?+?
??????geom_hline(yintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?log2(nFeature_RNA),?color?=?pHB)))?+?
??????geom_hline(yintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?log2(nFeature_RNA),?color?=?pRP)))?+?
??????geom_hline(yintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?pMT,?color?=?nFeature_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?pHB,?color?=?nFeature_RNA)))?+?
??????geom_hline(yintercept?=?pHB_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pHB_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nCount_RNA),?pRP,?color?=?nFeature_RNA)))?+?
??????geom_vline(xintercept?=?log2(nCount_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nCount_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nFeature_RNA),?pMT,?color?=?nCount_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nFeature_RNA),?pHB,?color?=?nCount_RNA)))?+?
??????geom_hline(yintercept?=?pHB_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pHB_upper,?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(log2(nFeature_RNA),?pRP,?color?=?nCount_RNA)))?+?
??????geom_vline(xintercept?=?log2(nFeature_lower),?color?=?"red",?linetype?=?2)?+
??????geom_vline(xintercept?=?log2(nFeature_upper),?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(pRP,?pMT,?color?=?nCount_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2),
????qc_std_plot_helper(ggplot(qc_data,?aes(pRP,?pMT,?color?=?nFeature_RNA)))?+?
??????geom_hline(yintercept?=?pMT_lower,?color?=?"red",?linetype?=?2)?+
??????geom_hline(yintercept?=?pMT_upper,?color?=?"red",?linetype?=?2),
????ggplot(gather(qc_data,?key,?value),?aes(key,?value))?+
??????geom_violin()?+
??????facet_wrap(~key,?scales?=?"free",?ncol?=?5),
????ncol?=?3,?align?=?"hv"
??)
}
##?過濾前
seu_obj_unfiltered?<-?seu_obj
#?作圖,有點(diǎn)慢,但是一般可以出來
qc_std_plot(seu_obj_unfiltered)
ggsave2("before_filtering.pdf",?path?=?"./result_3v3/",?width?=?30,?height?=?30,?units?=?"cm")##?過濾
seu_obj?<-?subset(seu_obj_unfiltered,?subset?=?nFeature_RNA?>?nFeature_lower?&?nFeature_RNA?<?nFeature_upper?&?nCount_RNA?>?nCount_lower?&?nCount_RNA?<?nCount_upper?&?pMT?<?pMT_upper?&?pHB?<?pHB_upper)
#有點(diǎn)慢,同樣可以出
qc_std_plot(seu_obj)
ggsave2("after_filtering.pdf",?path?=?"./result_3v3/",?width?=?30,?height?=?30,?units?=?"cm")
#?看過濾了多少
seu_obj_unfiltered
seu_obj#####################?SCTransform提供了去除深度的策略###############################
#利用seurat的SCTransform消除深度差異,非常推薦的標(biāo)準(zhǔn)化的方法,之后保存下次用!
seu_obj?<-?SCTransform(seu_obj,?verbose?=?T,?vars.to.regress?=?c("nCount_RNA",?"pMT"),?conserve.memory?=?T)
saveRDS(seu_obj,?file?=?"scRNA_SCTransform.RDS")后記
關(guān)注 生信作曲家 wx,學(xué)習(xí)更多
本文使用 文章同步助手 同步