單細胞流程復現(xiàn)

最近有點時間,打算復現(xiàn)一篇單細胞的文章。記于2023/03/28
參照文章:Single-cell RNA landscape of the osteoimmunology microenvironment in periodontitis (thno.org)

大概就是對牙周炎組織做了一個單細胞測序;樣本有HCs組(four healthy controls)、PDs( five patients with severe chronic periodontitis )、PDTs(three patients with severe chronic periodontitis after initial periodontal therapy within 1 month )。共計12個文庫樣本。由于是同一個組織不同處理組,所以我的基本處理思路:
按原始流程跑下來之后,根據(jù)不同的組的細胞降維聚類結果判斷是否具有批次效應;在去除批次效應之后,又重新降維聚類,并走完接下來的Clsuter注釋等。

直接從下載該文章的GEO數(shù)據(jù)

這里我下載的是GSE171213_RAW.tar,后面需要自己整合。當然也可以下載文章的原始數(shù)據(jù),使用cellranger從頭跑一遍,具體流程代碼示例:單細胞實戰(zhàn)(四) Cell Ranger流程概覽 (qq.com)

下載并解壓GSE171213_RAW.tar文件后,得到

tar -xvf GSE171213_RAW.tar
#GSM5220920_HC1.cellname.list.txt
#GSM5220920_HC1.counts.tsv
#GSM5220921_HC2.cellname.list.txt
#GSM5220921_HC2.counts.tsv
#GSM5220922_HC3.cellname.list.txt
#GSM5220922_HC3.counts.tsv
#GSM5220923_HC4.cellname.list.txt
#GSM5220923_HC4.counts.tsv
#GSM5220924_PD1.cellname.list.txt
#GSM5220924_PD1.counts.tsv
#GSM5220925_PD2.cellname.list.txt
#GSM5220925_PD2.counts.tsv
#GSM5220926_PD3.cellname.list.txt
#GSM5220926_PD3.counts.tsv
#GSM5220927_PD4.cellname.list.txt
#GSM5220927_PD4.counts.tsv
#GSM5220928_PD5.cellname.list.txt
#GSM5220928_PD5.counts.tsv
#GSM5220929_PDT1.cellname.list.txt
#GSM5220929_PDT1.counts.tsv
#GSM5220930_PDT2.cellname.list.txt
#GSM5220930_PDT2.counts.tsv
#GSM5220931_PDT3.cellname.list.txt
#GSM5220931_PDT3.counts.tsv

創(chuàng)建Seurat Object對象并整合數(shù)據(jù)

##大致流程:讀取--創(chuàng)建SeuratObject對象--數(shù)據(jù)質控--標準化、識別高邊基因、歸一化表達矩陣--降維聚類--類型注釋
library(Seurat)
library(dplyr)
library(edgeR)
library(stringr)
library(patchwork)
library(ggpubr)
rm(list = ls())


#set.seed(12123) #這里最好設置隨機種子,保證每次操作的可重復性
folders = list.files(pattern = "*tsv")
samples_name = strsplit(folders, "_") %>% sapply(function(x)x[2]) %>% strsplit("\\.") %>% sapply(function(x)x[1])
##samples_name : "HC1"  "HC2"  "HC3"  "HC4"  "PD1"  "PD2"  "PD3"  "PD4"  "PD5"  "PDT1" "PDT2" "PDT3"

##使用lapply函數(shù)對每一個組創(chuàng)建一個單獨的Seurat object對象。
sceList = lapply(folders,function(folder){ 
  CreateSeuratObject(counts = read.table(folder,header = T,sep = "\t",row.names = 1), 
                     min.cells = 60,min.features = 200,project = strsplit(folder, "_") %>% 
                       sapply(function(x)x[2]) %>% strsplit("\\.") %>% sapply(function(x)x[1]))
})

names(sceList) =samples_name  ##修改名稱

##對每個object的meta.data添加分組信息,例如:
sceList[[1]]@meta.data$group <- "HC"
sceList[[2]]@meta.data$group <- "HC"
sceList[[3]]@meta.data$group <- "HC"
sceList[[4]]@meta.data$group <- "HC"
sceList[[5]]@meta.data$group <- "PD"
sceList[[6]]@meta.data$group <- "PD"
sceList[[7]]@meta.data$group <- "PD"
sceList[[8]]@meta.data$group <- "PD"
sceList[[9]]@meta.data$group <- "PD"
sceList[[10]]@meta.data$group <- "PDT"
sceList[[11]]@meta.data$group <- "PDT"
sceList[[12]]@meta.data$group <- "PDT"

###整合
sce.big = merge(sceList[[1]],sceList[2:length(sceList)],
                add.cell.ids = samples_name)
save(sce.big,file = 'sce.big.merge.teeth.Rdata')  ##保存

###查看每個分組中的細胞數(shù)目
table(sce.big$orig.ident)
[result]:
HC1  HC2  HC3  HC4  PD1  PD2  PD3  PD4  PD5 PDT1 PDT2 PDT3 
3781 2637 4485 4534 4503 3676 4290 5398 4361 6677 8320 4387 

質控并初步判斷是否存在批次效應

###QC,這里我是用的文章的標準:每個細胞的線粒體含量:≤ 40%;每個細胞的基因數(shù)目:≥ 200
sce.big[["percent.mt"]] = PercentageFeatureSet(sce.big,pattern = "^MT-")
head(sce.big@meta.data,5)
sce.big = subset(sce.big,subset = nFeature_RNA>200 & percent.mt < 40) 
VlnPlot(sce.big,pt.size = 0,features = c ("nFeature_RNA","percent.mt"),ncol=2)

##查看是否有批次效應;這里使用的是Seurat的步驟:NormalizeData ==> FindVariableFeatures ==> ScaleData;
sce.big = NormalizeData(sce.big, normalization.method = "LogNormalize", scale.factor = 10000)%>%
  FindVariableFeatures (selection.method = "vst", nfeatures = 2000)%>%ScaleData()
sce.big = RunPCA(sce.big,verbose = F)
ElbowPlot(sce.big, ndims = 50)  ##用于判斷接下來使用的Dims個數(shù)
pc.num=1:30
sce.big = sce.big %>% RunTSNE(dims=pc.num)%>% RunUMAP(dims = pc.num)
sce.big = sce.big %>% FindNeighbors(dims = pc.num)%>% FindClusters()
DimPlot(sce.big,label = T)
DimPlot(sce.big,group.by = "orig.ident")
DimPlot(sce.big,group.by = "group")
DimPlot(sce.big,group.by = "orig.ident",split.by = "orig.ident",ncol=4)
Fig1.png

這么一看,未作CCA和harmony前,未出現(xiàn)明顯的批次效應。盡管如此,我們還是走一遍去批次的流程。常用的方法有Seurat包的CCA以及Harmony,相比而言,CCA容易出現(xiàn)過矯正的情況,這里我們使用Harmony(根據(jù)文章的來)

去除批次效應

###去除批次效應
#使用harmony去除批次效應
cellinfo = subset(sce.big@meta.data,select = c("orig.ident","group","nCount_RNA","nFeature_RNA"))   
scRNA = CreateSeuratObject(sce.big@assays$RNA@counts,meta.data = cellinfo)#重新創(chuàng)建Serurat object
#這里我們使用SCTranscform代替Seurat流程中的NormalizeData ==> FindVariableFeatures ==> ScaleData步驟
scRNA <- SCTransform(sce.big)   
scRNA = RunPCA(scRNA,npcs=50,verbose=FALSE)
scRNA = RunHarmony(scRNA,group.by.vars = "orig.ident",
                   assay.use = "SCT",
                   max.iter.harmony = 20)  
##使用Harmony去除批次效應,批次的來源(group.by.vars)為orig.ident,矩陣(assay)為“SCT”
ElbowPlot(scRNA, ndims = 50)
pc.num=1:10
scRNA = RunTSNE(scRNA,reduction = "harmony",dims=pc.num)%>%
  RunUMAP(reduction="harmony",dims = pc.num)%>%FindNeighbors()%>%FindClusters(resolution = 0.5)
debatch1=DimPlot(scRNA,group.by = "orig.ident")
debatch2=DimPlot(scRNA,group.by = "group",split.by = "group",ncol=4)
debatch3=DimPlot(scRNA,label = TRUE)    ##可以看到使用resolution=0.5的時候,cluster有28個,該值是否合適有待商榷!

design = "AA
          BC"
wrap_plots(debatch2, debatch1, debatch3, design = design) 

##使用clustree查看不同resolution值的分群情況。
library(cluster)
library(clustree)
scRNA = FindClusters(scRNA,resolution =c(seq(.1,1.5,.2)))  #這里resolution從0.1到1.5,步長0.2,共計8個值。
clustree_plot = clustree(scRNA@meta.data, prefix = "SCT_snn_res.")
colnames(scRNA@meta.data) 
[result]:
1] "orig.ident"      "nCount_RNA"      "nFeature_RNA"    "group"           "nCount_SCT"      "nFeature_SCT"   
 [7] "SCT_snn_res.0.5" "seurat_clusters" "SCT_snn_res.0.1" "SCT_snn_res.0.4" "SCT_snn_res.0.6" "SCT_snn_res.0.8"
[13] "SCT_snn_res.1"   "SCT_snn_res.1.2" "SCT_snn_res.1.4" "SCT_snn_res.1.6" "SCT_snn_res.0.3" "SCT_snn_res.0.7"
[19] "SCT_snn_res.0.9" "SCT_snn_res.1.1" "SCT_snn_res.1.3" "SCT_snn_res.1.5"

Idents(scRNA) <- "SCT_snn_res.0.1"   ##從Clustree的結果看,我直接選擇resolution=0.1,但在寫這篇簡書的時候發(fā)現(xiàn)選擇resolution=0.3可能會更好。
DimPlot(scRNA,label = TRUE)  ##圖沒有貼出來,得到12個Clsuters
Fig2_1.png

Fig2_2_Clusree.png

從Clustree的結果看,我當時直接選擇resolution=0.1,但在寫這篇簡書的時候發(fā)現(xiàn)選擇resolution=0.3可能會更好,但后面流程都是resolution=0.1,就這樣吧,僅僅作為一個流程參考罷了。

尋找Marker基因并注釋。參考

##按照自己分析的,尋找不同cluster中的marker基因。
#如果是單個條件的單個樣本:使用以FindAllMarkers(),即將每個類群與所有其他類群進行比較,以確定潛在的marker genes
scRNA.markers= FindAllMarkers(scRNA,only.pos = TRUE,min.pct = 0.1,logfc.threshold=0.25,test.use = "wilcox")
top3 <- scRNA.markers %>% group_by(cluster) %>% top_n(n = 3, wt = avg_log2FC)   ##每個Cluster選擇3個top genes
Doheatmap = DoHeatmap(subset(scRNA,downsample = 300), features =top3$gene) + NoLegend()

#如果多個不同條件的樣本:使用FindConservedMarkers()尋找多個樣本中的保守基因作為markers genes
DefaultAssay(scRNA) <- "RNA"  ##這里我們使用原始計數(shù)而不是聚合數(shù)據(jù)。
cluster_marker_gene = mclapply(0:11, function(i){
                      cluster_i = FindConservedMarkers(scRNA, ident.1 = i, grouping.var = "group", verbose = FALSE) %>% as.data.frame()
                      cluster_i$ident = i
                      }, mc.cores = 12)
cluster_marker_gene_df = do.call(rbind, cluster_marker_gene)
head(cluster_marker_gene_df,3)
DoHeatmap.png

根據(jù)文章提供的marker gene作注釋#1

##按照文章的marker基因查看每個cluster中基因的表達情況。
markers.to.plot <- c("TRAC", "MS4A1", "IGHG1", "PECAM1", "FCGR3B", "MS4A6A", "COL1A1", "TPSAB1", "KRT6A","LTF")
#Dot圖
DotPlot(scRNA, features = markers.to.plot, cols = c("blue", "red"), dot.scale = 8) +RotatedAxis() 
#Feature圖
FeaturePlot(scRNA, features = markers.to.plot, max.cutoff = 3,cols = c("grey", "red"),min.cutoff = 'q9')
#小提琴圖
plot = VlnPlot(scRNA, features = markers.to.plot, pt.size = 0, combine = FALSE)
wrap_plots(plots = plot, ncol = 4)+plot_layout(guides = "collect")
DotPlot.png

VlnPlot.png

FeaturePlot.png

可以看到使用文章的marker對于Cluster的注釋還是不錯的,不同的marker 在不同的Cluster表達情況區(qū)分明顯。

根據(jù)文章提供的marker gene作注釋#2

scRNA <- RenameIdents(scRNA, 
                      `0` = "T cell", 
                      `1` = "Endotheliad",
                      `2` = "B cell",
                      `3` = "Neutrophil", 
                      `4` = "Plasma", `5` = "Monocytic", 
                      `6` = "Plasma", `7` = "Epitheial", 
                      `8` = "Plasma", `9` = "Mast cell",
                      `10` = "Fibroblasts", 
                      `11` = "unknow")
DimPlot(scRNA, label = TRUE,split.by = "group") 
DimPlot(scRNA, label=TRUE)
Cell_kind_landscape.split.png

復現(xiàn)文章中Boxplot圖,對每一個細胞Cluster統(tǒng)計不同處理組中細胞變化情況。

##畫不同的cluster在不同組的分布(boxplot)
# group.add = c(rep("HC",4),rep("PD",5),rep("PDT",3))
my_comparisons <- list(c("HC", "PD"), c("HC", "PDT"), c("PD", "PDT"))
cell <- list()
plot3 = list()

for (i in 1:length(cellkind)){
  cell_i = subset(scRNA,idents =cellkind[i])$orig.ident %>% table() %>% prop.table() %>% as.data.frame()
  cell_i$group = gsub('.{1}$', '', cell_i$.)
  plot3_i = ggplot(cell_i,aes(x=group,y=Freq,fill=group))+geom_boxplot()+theme_classic()+
    labs(title = cellkind[i])+xlab(NULL)+
    stat_compare_means(comparisons = my_comparisons,method = "t.test",label = "p.format",size = 2)
  cell[[i]] <- list(cell_i)
  plot3[[i]] = plot3_i
}

rm(plot3_i)
rm(cell_i)
wrap_plots(plots = plot3, ncol = 4)+plot_layout(guides = "collect")   ##整理列表的輸入和輸出
cluster_cell_group_boxplot.png

可以看到和文章中總體趨勢一致,但在P值和顯著性水平上仍是和文章有部分出入。

細胞軌跡分析

##細胞軌跡分析:packages安裝。
library(devtools)
install_github("velocyto-team/velocyto.R")
devtools::install_github('cole-trapnell-lab/monocle3')
報錯:
Error: Failed to install 'unknown package' from GitHub:
  An unknown option was passed in to libcurl

報錯的原因應該是Curl包版本的問題。服務器上的Rstudio server鏈接在public下的R版本,一些R包安裝起來就和系統(tǒng)的一些東西起沖突,如果是自己的環(huán)境下,一個conda直接解決,可惜root不在我。
-------記于2023/3/31 00:47

最后編輯于
?著作權歸作者所有,轉載或內容合作請聯(lián)系作者
【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容