########################################################
#-------------------------------------------------------
# Topic:小鼠單細胞數據跑SOM流程
# Author:Wang Haiquan
# Date:Fri Jun 12 09:54:57 2020
# Mail:mg1835020@smail.nju.edu.cn
#-------------------------------------------------------
########################################################
library(kohonen)
library(Seurat)
library(stringr)
library(ggplot2)
library(pheatmap)
# Load the raw mouse expression matrix. Columns are treated as samples below
# (their names carry the group label); presumably rows are genes -- TODO confirm.
sample_mouse <- readRDS("../20200604human_mouse/data/tang_mouse.rds")
dim(sample_mouse)

# The second "_"-separated field of every column name is used as the group
# label; the vector is named by the full column name.
sample_mouse_group <- setNames(
  str_split(colnames(sample_mouse), "_", simplify = TRUE)[, 2],
  colnames(sample_mouse)
)
# Keep only the samples whose group label starts with "RS".
sample_mouse_group <- sample_mouse_group[grepl("^RS", sample_mouse_group)]
table(sample_mouse_group)

# Restrict the matrix to the retained samples, then run the Seurat
# preprocessing steps one at a time: normalise, scale, pick 5000 variable
# features.
sample_mouse <- sample_mouse[, names(sample_mouse_group)]
sample_mouse <- CreateSeuratObject(sample_mouse)
sample_mouse <- NormalizeData(sample_mouse)
sample_mouse <- ScaleData(sample_mouse)
sample_mouse <- FindVariableFeatures(sample_mouse, nfeatures = 5000)

# From here on `sample_mouse` is the scaled matrix restricted to the variable
# features, no longer a Seurat object.
sample_mouse <- sample_mouse@assays$RNA@scale.data[VariableFeatures(sample_mouse), ]
dim(sample_mouse)
#-------------------------------------------------------
# Function: run PCA and inspect sample-to-sample correlation
#-------------------------------------------------------
# PCA on the transposed matrix (samples become rows). No extra centering or
# scaling is requested; `sample_mouse` is expected to hold already-scaled
# values at this point.
sample_mouse_pca <- prcomp(t(sample_mouse), center = FALSE, scale. = FALSE)
plot(sample_mouse_pca)

# Keep only the score matrix and print it.
sample_mouse_pca <- sample_mouse_pca[["x"]]
sample_mouse_pca

# First two components, coloured by group label.
ggplot(
  data = as.data.frame(sample_mouse_pca),
  mapping = aes(x = PC1, y = PC2, color = sample_mouse_group)
) +
  geom_point()

# Column annotation (one row per sample); also reused by later heatmaps.
sample_anno_col <- data.frame(
  cell_type = sample_mouse_group,
  row.names = colnames(sample_mouse)
)

# Correlation between the columns of `sample_mouse`, drawn as a heatmap.
pheatmap(
  cor(sample_mouse),
  annotation_col = sample_anno_col,
  show_rownames = FALSE,
  show_colnames = FALSE
)
#-------------------------------------------------------
# Function: use per-group mean expression as the SOM input
#-------------------------------------------------------
# Reload the raw matrix and rebuild the group labels from the column names
# (second "_"-separated field), keeping only labels that start with "RS".
sample_mouse <- readRDS("../20200604human_mouse/data/tang_mouse.rds")
dim(sample_mouse)
sample_mouse_group <- setNames(
  str_split(colnames(sample_mouse), "_", simplify = TRUE)[, 2],
  colnames(sample_mouse)
)
sample_mouse_group <- sample_mouse_group[grepl("^RS", sample_mouse_group)]
table(sample_mouse_group)

# Same Seurat preprocessing as before, written step by step.
sample_mouse <- sample_mouse[, names(sample_mouse_group)]
sample_mouse <- CreateSeuratObject(sample_mouse)
sample_mouse <- NormalizeData(sample_mouse)
sample_mouse <- ScaleData(sample_mouse)
sample_mouse <- FindVariableFeatures(sample_mouse, nfeatures = 5000)

# Use the group labels as identities so the averaging is done per group.
sample_mouse@active.ident <- as.factor(sample_mouse_group)
sample_mouse <- AverageExpression(
  sample_mouse,
  features = VariableFeatures(sample_mouse)
)
sample_mouse <- sample_mouse[["RNA"]]
dim(sample_mouse)

# Standardise each row across the group columns: scale() works on columns,
# hence the transpose before and after.
sample_mouse <- t(sample_mouse)
sample_mouse <- scale(sample_mouse)
sample_mouse <- t(sample_mouse)
# Train the SOM and draw its diagnostic plots.

# Palette passed to every plot below (blue = low, red = high).
# NOTE(review): this helper is believed to exist only in the examples of
# ?plot.kohonen, not as an exported kohonen function, and nothing else in this
# script defines it -- so it is defined here to keep the script self-contained.
coolBlueHotRed <- function(n, alpha = 1) {
  rev(rainbow(n, end = 4 / 6, alpha = alpha))
}

# Fix the RNG state so the trained map can be reproduced between runs
# (presumably supersom draws from R's RNG -- TODO confirm).
set.seed(20200612)
mouse_som_grid <- somgrid(xdim = 20, ydim = 20, topo = "hexagonal")
mouse_som <- supersom(sample_mouse, mouse_som_grid, rlen = 500)

# Remember the previous layout so it can be restored at the end of the section.
old_par <- par(mfcol = c(2, 2))
plot(mouse_som, type = "changes")
plot(mouse_som, type = "counts", palette.name = coolBlueHotRed)
plot(mouse_som, type = "codes", palette.name = coolBlueHotRed)
plot(mouse_som, type = "quality", palette.name = coolBlueHotRed)
par(mfcol = c(1, 2))
plot(mouse_som, type = "mapping", palette.name = coolBlueHotRed)
plot(mouse_som, type = "dist.neighbours", palette.name = coolBlueHotRed)

# Codebook matrix: one row per SOM unit, one column per input column.
mouse_code <- getCodes(mouse_som)

# One property map per codebook column (no longer hard-coded to four columns).
par(mfcol = c(2, 2))
for (group_idx in seq_len(ncol(mouse_code))) {
  plot(
    mouse_som,
    type = "property",
    property = mouse_code[, group_idx],
    palette.name = coolBlueHotRed,
    main = colnames(mouse_code)[group_idx]
  )
}

# Partition the units into clusters.
# A unit is labelled with the column holding its largest codebook value when
# that value exceeds the threshold 1; every other unit gets the catch-all
# label 5. This matches the previous "zero out values <= 1, then require the
# row sum to exceed 1" rule, because the row sum of the surviving values
# exceeds 1 exactly when at least one value does. isTRUE() sends a unit with
# missing codebook values to the catch-all cluster instead of failing.
mouse_code_cluster <- apply(mouse_code, 1, function(unit_codes) {
  if (isTRUE(max(unit_codes) > 1)) {
    colnames(mouse_code)[which.max(unit_codes)]
  } else {
    5
  }
})
mouse_code_cluster

# Same property maps, now with the cluster boundaries drawn on top.
par(mfcol = c(2, 2))
for (group_idx in seq_len(ncol(mouse_code))) {
  plot(
    mouse_som,
    type = "property",
    property = mouse_code[, group_idx],
    palette.name = coolBlueHotRed,
    main = colnames(mouse_code)[group_idx]
  )
  add.cluster.boundaries(mouse_som, mouse_code_cluster)
}
par(old_par)
# Heatmap to inspect the genes grouped by SOM cluster.

# One row per gene, with the index of the SOM unit the gene was mapped to.
mouse_som_genelist <- data.frame(
  gene = rownames(sample_mouse),
  cluster1 = mouse_som[["unit.classif"]]
)

# One row per SOM unit: the unit index is taken as the text after "V" in the
# unit name (names are presumably of the form "V<index>" -- TODO confirm),
# together with the cluster label assigned to that unit.
mouse_code_df <- data.frame(
  cluster1 = str_split(names(mouse_code_cluster), "V", simplify = TRUE)[, 2],
  cluster2 = mouse_code_cluster
)

# Attach the cluster label to every gene through its unit index, then sort the
# genes by cluster label and key the table by gene name.
mouse_som_genelist <- merge(
  x = mouse_som_genelist,
  y = mouse_code_df,
  by = "cluster1"
)
mouse_som_genelist <- mouse_som_genelist[
  order(factor(mouse_som_genelist[["cluster2"]])),
]
rownames(mouse_som_genelist) <- mouse_som_genelist[["gene"]]
head(mouse_som_genelist)

# Rows are drawn in the sorted order; the second column (gene) is dropped from
# the row annotation.
pheatmap(
  sample_mouse[as.character(mouse_som_genelist[["gene"]]), ],
  annotation_row = mouse_som_genelist[, -2],
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  show_rownames = FALSE,
  show_colnames = FALSE
)
# Per-sample view of the same genes: reload the raw matrix, keep the genes of
# the SOM gene list (in that order) and the retained samples, then normalise
# and scale with Seurat.
sample_mouse_scale <- readRDS("../20200604human_mouse/data/tang_mouse.rds")
sample_mouse_scale <- sample_mouse_scale[
  rownames(mouse_som_genelist),
  names(sample_mouse_group)
]
sample_mouse_scale <- CreateSeuratObject(sample_mouse_scale)
sample_mouse_scale <- NormalizeData(sample_mouse_scale)
sample_mouse_scale <- ScaleData(sample_mouse_scale)

# Only the scaled matrix is needed for the heatmap.
sample_mouse_scale <- sample_mouse_scale@assays$RNA@scale.data
head(sample_mouse_scale)

# Columns are ordered by group label; 100 colours need 101 break points,
# which seq(-4, 4, 8 / 100) provides.
pheatmap(
  sample_mouse_scale[, order(factor(sample_mouse_group))],
  annotation_row = mouse_som_genelist[, -2],
  annotation_col = sample_anno_col,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  show_rownames = FALSE,
  show_colnames = FALSE,
  color = colorRampPalette(c("purple", "black", "yellow"))(100),
  breaks = seq(-4, 4, 8 / 100)
)
# Comparison with Seurat's FindAllMarkers.

# Rebuild a Seurat object restricted to the SOM gene list and the retained
# samples, with the group labels as identities.
sample_mouse_scale <- readRDS("../20200604human_mouse/data/tang_mouse.rds")
sample_mouse_scale <- sample_mouse_scale[
  rownames(mouse_som_genelist),
  names(sample_mouse_group)
]
sample_mouse_scale <- CreateSeuratObject(sample_mouse_scale)
sample_mouse_scale@active.ident <- as.factor(sample_mouse_group)
sample_mouse_scale <- NormalizeData(sample_mouse_scale) %>%
  ScaleData()

# Positive markers only, t-test.
sample_mouse_find_marker <- FindAllMarkers(
  sample_mouse_scale,
  only.pos = TRUE,
  test.use = "t"
)
# head() shows at most six rows and never pads with NA rows.
head(sample_mouse_find_marker)

library(UpSetR)

# NOTE(review): the filter uses the raw p-value (`p_val`), not the adjusted
# one (`p_val_adj`) -- confirm this is intended.
sample_seurat_marker_res <- sample_mouse_find_marker[
  sample_mouse_find_marker$p_val < 0.05,
]

# Gene lists per group. split() builds the same named per-group lists that
# tapply(..., print) did, without printing every gene to the console.
sample_seurat_marker_res <- split(
  sample_seurat_marker_res$gene,
  sample_seurat_marker_res$cluster
)
sample_som_marker_res <- split(
  as.character(mouse_som_genelist$gene),
  mouse_som_genelist$cluster2
)
names(sample_seurat_marker_res)
names(sample_som_marker_res)

# Index (within the Seurat result) of the group to compare.
a <- 4
# Pair the two results by group name rather than by an `a` / `a + 1` position
# offset, which is only correct while the SOM result carries exactly one extra
# leading cluster (the catch-all "5").
compare_group <- names(sample_seurat_marker_res)[a]
stopifnot(
  !is.na(compare_group),
  compare_group %in% names(sample_som_marker_res)
)
sample_mark_compare <- fromList(list(
  seurat_1 = sample_seurat_marker_res[[compare_group]],
  som_1 = sample_som_marker_res[[compare_group]]
))
upset(sample_mark_compare)
# Rebuild the Seurat object (SOM gene list x retained samples) and run the
# standard pipeline up to UMAP.
sample_mouse_scale <- readRDS("../20200604human_mouse/data/tang_mouse.rds")
sample_mouse_scale <- sample_mouse_scale[
  rownames(mouse_som_genelist),
  names(sample_mouse_group)
]
sample_mouse_scale <- CreateSeuratObject(sample_mouse_scale) %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  # `dims` spelled out in full; `dim = ` only worked through partial
  # argument matching.
  RunUMAP(dims = 1:10)

# Group labels become the identities used to split the violin plots.
sample_mouse_scale@active.ident <- as.factor(sample_mouse_group)

# Up to ten Seurat markers of group RS1o2. head() avoids passing NA feature
# names when fewer than ten markers are available (`[1:10]` would pad with NA).
VlnPlot(sample_mouse_scale, head(sample_seurat_marker_res$RS1o2, 10))
#########################################################################
#結論,感覺效果挺差的,如果用來處理組內均一性很好的樣本的話應該效果會不錯,但是用來處理單細胞,感覺emmmm
#########################################################################
修改一下上面錯誤的結論
通過如下代碼的更細致的觀察
發現在處理單細胞的數據中,使用SOM方法找到的模式表達基因更傾向于特異性,缺點是會受一些低表達基因異常值的影響,而錯將低表達基因也納入特異性表達基因列表,因此使用前需要對低表達基因做細致篩查。(簡單看了一下基本上som單獨篩選的都是低表達的基因,但是在熱圖展示上由于是根據行scale的,因此很難看出來,所以我們需要做之前細致篩查!)
普通的差異分析方法優點是能夠過濾低表達基因,但是在模式的特異性上可能比較差。涉及到至少有一組顯著就顯著還是唯一顯著這樣的問題,尋找差異基因時一般是滿足前者即可,而我們在尋找階段性特異表達基因的時候都希望是唯一顯著!
# Compare the genes selected by the two methods (SOM vs FindAllMarkers).

# Rebuild the Seurat object (SOM gene list x retained samples) and run the
# standard pipeline up to UMAP.
sample_mouse_scale <- readRDS("../20200604human_mouse/data/tang_mouse.rds")
sample_mouse_scale <- sample_mouse_scale[
  rownames(mouse_som_genelist),
  names(sample_mouse_group)
]
sample_mouse_scale <- CreateSeuratObject(sample_mouse_scale) %>%
  NormalizeData() %>%
  FindVariableFeatures() %>%
  ScaleData() %>%
  RunPCA() %>%
  # `dims` spelled out in full; `dim = ` only worked through partial
  # argument matching.
  RunUMAP(dims = 1:10)
sample_mouse_scale@active.ident <- as.factor(sample_mouse_group)

# Draw up to `n` random genes from the first `pool` entries of `genes`.
# The previous `genes[sample(1:50, 5)]` indexed past the end of any gene set
# shorter than 50 and handed NA feature names to VlnPlot; here the draw is
# limited to the genes that actually exist. For sets of 50 or more genes the
# draw is the same as before.
pick_random_genes <- function(genes, n = 5, pool = 50) {
  candidates <- head(genes, pool)
  candidates[sample.int(length(candidates), min(n, length(candidates)))]
}

# RS3o4 is the group where the two results differ the most, so it is used for
# the comparison.
# Genes selected by both methods.
VlnPlot(
  sample_mouse_scale,
  pick_random_genes(
    intersect(sample_som_marker_res$RS3o4, sample_seurat_marker_res$RS3o4)
  )
)
# Selected by SOM but not by FindAllMarkers.
VlnPlot(
  sample_mouse_scale,
  pick_random_genes(
    setdiff(sample_som_marker_res$RS3o4, sample_seurat_marker_res$RS3o4)
  )
)
# Selected by FindAllMarkers but not by SOM.
VlnPlot(
  sample_mouse_scale,
  pick_random_genes(
    setdiff(sample_seurat_marker_res$RS3o4, sample_som_marker_res$RS3o4)
  )
)

兩種方法篩到基因的差別

二者都篩到的基因

僅som篩到

僅findmarker篩到
尋找階段特異性基因,可以通過設定codebook vector閾值的方式查找,從下圖我們可以看到,在第30個神經元中,四類細胞的值分別為1.4840752、-0.3065276、-0.6562483、-0.5212993,因為該神經元下的基因的表達情況應盡量與該權重向量靠近,因此我們有理由推斷該神經元下的基因在第一類細胞中表達很高,在其它三類細胞中表達很低,如下圖所示

Rplot05.png

Rplot06.png