day9-1 單細胞數(shù)據(jù)的二次分群

day9-1 單細胞數(shù)據(jù)的二次分群

1.背景

Seurat里的FindClusters函數(shù)設置的resolution數(shù)值越大,分群的數(shù)量就越多,但是當單細胞數(shù)量太多的時候,會遇到resolution再變大,分群的數(shù)量也不再增加的情況。一次分群分不開,但又想要細分時,就會需要二次分群。

2.加載數(shù)據(jù)

rm(list = ls())
library(Seurat)
library(dplyr)
load("sce.Rdata")
seu.obj = sce
head(seu.obj@meta.data)
p1 = DimPlot(seu.obj, reduction = "umap",label=T)+NoLegend()
p1

3.二次分群

這里以成纖維細胞為例進行二次分群。想要切換別的細胞類型直接修改下面的my_sub即可。
核心就是提取感興趣的亞群的細胞組成的Seurat對象,后面就是標準流程和可視化了,沒有任何區(qū)別。

my_sub = "Fibroblasts"
sub.cells <- subset(seu.obj, idents = my_sub)
f = "obj.Rdata"
if(!file.exists(f)){
  sub.cells = sub.cells %>%
    NormalizeData() %>%
    FindVariableFeatures() %>%
    ScaleData(features = rownames(.)) %>%
    RunPCA(features = VariableFeatures(.))  %>%
    FindNeighbors(dims = 1:15) %>%
    FindClusters(resolution = 0.5) %>%
    RunUMAP(dims = 1:15) 
  save(sub.cells,file = f)
}
load(f)
DimPlot(sub.cells, reduction = 'umap',label = T)+NoLegend()
sub.cells@meta.data$celltype = paste0("M",sub.cells$seurat_clusters) #把子對象的亞群注釋結果添加到表格上面去。
save(sub.cells,file = "sub.cells.Rdata")
image.png

4.marker基因及其可視化

#找亞群間的差異基因
sub.cells.markers <- FindAllMarkers(sub.cells, only.pos = TRUE,  
                                    min.pct = 0.25, logfc.threshold = 0.25)

top10 <- sub.cells.markers %>% 
  group_by(cluster) %>% 
  top_n(n = 10, wt = avg_log2FC) %>% 
  pull(gene);top10
# [1] "PLN"      "C2orf40"  "NDUFA4L2" "TRIO"     "CBLB"     "ADIRF"   
 #[7] "TAGLN"    "TINAGL1"  "PPP1R14A" "MYH11"    "TNFAIP6"  "MEIS1"   
#[13] "ADGRD1"   "EMILIN2"  "ROR1"     "ADAMTS15" "ITGBL1"   "BDKRB1"  
#[19] "RNF217"   "SLPI"

導出的基因結果不大對。。。

#marker基因的5種可視化
VlnPlot(sub.cells, features = top10)
image.png
RidgePlot(sub.cells, features = top10)
image.png
FeaturePlot(sub.cells, features = top10)
image.png
DotPlot(sub.cells,features = top10)+ RotatedAxis()
image.png
DoHeatmap(sub.cells, features = top10) + NoLegend()
image.png

5.放回原來的Seurat對象里面

5.1 R語言基礎知識補充?match

match函數(shù)時匹配,用于找出一個向量中的元素在另一個向量中的對應位置,

x <- c("f","b")
y <- c("b","c","f")
# 使用match函數(shù)找到x在y中的位置
match(x, y)
## [1] 3 1
##x里的兩個元素f和b分別在y的第3和第1個位置
5.2 放回原來的Seurat對象里
seu.obj$celltype = as.character(Idents(seu.obj))
seu.obj$celltype = ifelse(seu.obj$celltype==my_sub,
       sub.cells$celltype[match(colnames(seu.obj),colnames(sub.cells))],
       seu.obj$celltype)

使用ifelse函數(shù)實現(xiàn)了分情況討論:判斷seu.obj的每個細胞是否是my_sub(本例是Fibroblasts),如果是的話,就替換成sub.cells里面匹配的每個細胞對應的celltype,也就是M0和M1;否則就不替換,還是原來的celltype。

seu.obj$celltype = as.character(Idents(seu.obj))
seu.obj$celltype = ifelse(seu.obj$celltype==my_sub,
                          sub.cells$celltype[match(colnames(seu.obj),colnames(sub.cells))],
                          seu.obj$celltype)
Idents(seu.obj) = seu.obj$celltype
p2 = DimPlot(seu.obj,label = T)+NoLegend()
p1+p2
image.png

day9-2 單樣本細胞周期

根據(jù)在各個周期高表達的基因來計算細胞周期評分,根據(jù)評分的高低來推斷細胞屬于什么周期。

1.讀取數(shù)據(jù)并做好前期的質控

1.1 前面使用的數(shù)據(jù)
rm(list = ls())
library(tidyverse)
library(Seurat)
ct = Read10X("input/")
seu.obj <- CreateSeuratObject(counts = ct, 
                              min.cells = 3, 
                              min.features = 200)
set.seed(1234)
seu.obj = subset(seu.obj,downsample = 3000)
seu.obj[["percent.mt"]] <- PercentageFeatureSet(seu.obj, pattern = "^MT-")
VlnPlot(seu.obj, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt"), 
        ncol = 3,pt.size = 0.5)
seu.obj = subset(seu.obj,
                 nFeature_RNA < 6000 &
                   nCount_RNA < 30000 &
                   percent.mt < 18)
1.2 marrow

這個數(shù)據(jù)來自Seurat 的細胞周期Vignette,是一個嚴重受到細胞周期影響的數(shù)據(jù)。

https://satijalab.org/seurat/articles/cell_cycle_vignette
加載

exp.mat <- read.delim("nestorawa_forcellcycle_expressionMatrix.txt",row.names = 1)
marrow <- CreateSeuratObject(counts = exp.mat,
                             project = "b",
                             min.cells = 3, 
                             min.features = 200)
marrow[["percent.mt"]] <- PercentageFeatureSet(marrow, pattern = "^MT-")
head(marrow@meta.data, 3)

VlnPlot(marrow, 
        features = c("nFeature_RNA",
                     "nCount_RNA", 
                     "percent.mt"), 
        ncol = 3,pt.size = 0.5)
image.png

2.計算細胞周期評分

CellCycleScoring就是計算細胞周期評分的函數(shù)。

check_cc =  function(ob){
s.genes <- intersect(cc.genes$s.genes,rownames(ob))
g2m.genes <- intersect(cc.genes$g2m.genes,rownames(ob))
ob = ob %>% 
  NormalizeData() %>%  
  FindVariableFeatures() %>%  
  CellCycleScoring(s.features = s.genes, 
                   g2m.features = g2m.genes) %>%
  ScaleData(features = rownames(.)) %>%  
  RunPCA(features = c(s.genes,g2m.genes))
return(ob)
}
ch1 = check_cc(seu.obj)
head(ch1@meta.data)
#                      orig.ident nCount_RNA nFeature_RNA percent.mt     S.Score   G2M.Score Phase
#AAACCCACAGTCGGTC-1 SeuratProject       4243         1256   6.292717 -0.02545080  0.05694222   G2M
#AAACGAAAGAATCGCG-1 SeuratProject       7307         2577   2.572875  0.03719043 -0.12520510     S
#AAAGAACAGCTTACGT-1 SeuratProject       8154         1881   4.083885  0.04363704 -0.02283322     S
#AAAGAACAGGTTCATC-1 SeuratProject       8223         2182   4.377964 -0.05930417 -0.03018975    G1
#AAAGAACAGTCTGTAC-1 SeuratProject       3884         1377   4.763131 -0.02251541  0.02168140   G2M
#AAAGAACTCCACCTCA-1 SeuratProject       3997         1307   3.402552  0.01273964 -0.13158839     S

3.比較兩個數(shù)據(jù)的細胞周期評分和PCA

這個函數(shù)處理完后,meta.data里面多了4列,分別是s和g2m的評分以及推斷的細胞周期。

table(ch1$Phase)
## 
##   G1  G2M    S 
## 1555  482  863
ch2 = check_cc(marrow)
table(ch2$Phase)
## 
##  G1 G2M   S 
## 279 183 312
PCAPlot(ch1,group.by = "Phase")+ PCAPlot(ch2,group.by = "Phase")

把坐標調(diào)到相同范圍。xlim和ylim是設置橫縱坐標范圍,要按需調(diào)整到把兩個圖的點都能放進去。例如圖1是橫坐標-60到5,縱坐標-6到5,圖二橫坐標范圍-10到15,縱坐標范圍-10到15,就設置為橫坐標-60到15,縱坐標-10到15。

library(patchwork)
PCAPlot(ch1,group.by = "Phase")+ 
  PCAPlot(ch2,group.by = "Phase")&
  xlim(-60,15)&
  ylim(-10,15)
image.png

再比較一下S.Score和G2M.Score

p1 = VlnPlot(ch1,"S.Score",group.by = "Phase")
p2 = VlnPlot(ch2,"S.Score",group.by = "Phase")
wrap_plots(p1,p2,nrow = 1) & ylim(-0.6,0.6)
image.png
p1 = VlnPlot(ch1,"G2M.Score",group.by = "Phase")
p2 = VlnPlot(ch2,"G2M.Score",group.by = "Phase")
wrap_plots(p1,p2,nrow = 1) & ylim(-0.5,1)
image.png

比較每一個細胞的S.Score和G2M.Score,就可以理解到每個細胞屬于什么細胞周期是怎么得出來的了:每個細胞都有S.score和G2M.score,如果S.score和G2M.score都是負的就判斷為G1期,否則,S.score高就是S期,G2M.score高就是G2M期??梢则炞C來理解:

table(ch1$Phase)
## 
##   G1  G2M    S 
## 1555  482  863
sum(ch1$S.Score<0 & ch1$G2M.Score <0) #G1
## [1] 1555
sum(ch1$S.Score>0 & ch1$S.Score > ch1$G2M.Score) #S
## [1] 863
sum(ch1$G2M.Score>0 & ch1$S.Score < ch1$G2M.Score) #G2M
## [1] 482

如果是大多數(shù)點都集中在0點附近的,就可以不用去除細胞周期的影響!,分布范圍較廣或者是有較多的離群值那就需要要去除。
讓我們來比較一下去除和不去除細胞周期影響的下游注釋看有沒有區(qū)別吧。

4.比較去除和不去處細胞周期影響的下游注釋

4.1 不考慮細胞周期的降維聚類分群
f = "ob1.Rdata"
if(!file.exists(f)){
  ob1 = seu.obj %>% 
  NormalizeData() %>%  
  FindVariableFeatures() %>%  
  ScaleData(features = rownames(.)) %>%  
  RunPCA(features = VariableFeatures(.))  %>%
  FindNeighbors(dims = 1:15) %>% 
  FindClusters(resolution = 0.5) %>% 
  RunUMAP(dims = 1:15) %>% 
  RunTSNE(dims = 1:15)
  save(ob1,file = f)
}
load(f)
4.2 考慮細胞周期的降維聚類分群

其實是根據(jù)s期和g2m期各自有代表性的基因來打分,然后在ScaleData函數(shù)中加上vars.to.regress參數(shù)來消除影響。
這里的cc.genes是Seurat包里自帶的數(shù)據(jù),無需任何賦值或加載的操作,可以直接使用。

s.genes <- intersect(cc.genes$s.genes,rownames(seu.obj))
g2m.genes <- intersect(cc.genes$g2m.genes,rownames(seu.obj))
f = "ob2.Rdata"
if(!file.exists(f)){
  ob2 = seu.obj %>% 
  NormalizeData() %>%  
  FindVariableFeatures() %>%  
  CellCycleScoring(s.features = s.genes, g2m.features = g2m.genes) %>%
  ScaleData(vars.to.regress = c("S.Score", "G2M.Score"),features = rownames(.)) %>%  #運行極其慢
  RunPCA(features = VariableFeatures(.))  %>%
  FindNeighbors(dims = 1:15) %>% 
  FindClusters(resolution = 0.5) %>% 
  RunUMAP(dims = 1:15) %>% 
  RunTSNE(dims = 1:15)
  save(ob2,file = f)
}
load(f)
p1 <- DimPlot(ob1, reduction = "umap",label = T)+NoLegend()
p2 <- DimPlot(ob2, reduction = "umap",label = T)+NoLegend()
p1+p2
image.png

用singleR來注釋

library(celldex)
library(SingleR)
ls("package:celldex")
f = "ref_BlueprintEncode.RData"
if(!file.exists(f)){
  ref <- celldex::HumanPrimaryCellAtlasData()
  save(ref,file = f)
}
ref <- get(load(f))
library(BiocParallel)
scRNA = ob1
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = ref$label.main, 
                      clusters = scRNA@active.ident)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
p3 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
m = scRNA
scRNA = ob2
test = scRNA@assays$RNA$data
pred.scRNA <- SingleR(test = test, 
                      ref = ref,
                      labels = ref$label.main, 
                      clusters = scRNA@active.ident)
new.cluster.ids <- pred.scRNA$pruned.labels
names(new.cluster.ids) <- levels(scRNA)
scRNA <- RenameIdents(scRNA,new.cluster.ids)
p4 <- DimPlot(scRNA, reduction = "umap",label = T,pt.size = 0.5) + NoLegend()
p3+p4
image.png
table(as.character(Idents(m))==as.character(Idents(scRNA))) 
## FALSE  TRUE 
##125  2749 

去不去除當然還是有區(qū)別的,如果差太多那就采用去除后的結果,就是比較費計算資源。
結果還是不大一樣

最后編輯于
?著作權歸作者所有,轉載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容