學(xué)習(xí)單細胞分析—(2)R包下載及數(shù)據(jù)導(dǎo)入

Seurat有V5,V4版本,它們的數(shù)據(jù)格式略微有點不同,如果想使用不同版本,可放在不同的R包保存路徑中,避免沖突。
而且安裝這個包前,需要把它相關(guān)的依賴包安裝上,才能安裝成功。

rm(list=ls())
options(stringsAsFactors = F)

# 創(chuàng)建個人庫
#dir.create("~/R_libs/SeuratV4", recursive = TRUE)
## 你需要先安裝這個
lib = "~/R_libs/SeuratV4"
.libPaths(c("~/R_libs/SeuratV4", .libPaths()))

# namespace ‘Matrix’ 1.6-0 is being loaded, but >= 1.6.4 is required
packageVersion("Matrix") 
#https://cran.r-project.org/src/contrib/Archive/Matrix/
install.packages("~/R_libs/SeuratV4/Matrix_1.6-4.tar.gz",repos = NULL, type="source",lib ="~/R_libs/SeuratV4")

install.packages("SeuratObject",lib ="~/R_libs/SeuratV4" )
#Tools for Single Cell Genomics ? Seurat https://satijalab.org/seurat/v
remotes::install_version("SeuratObject", "4.1.4", repos = c("https://satijalab.r-universe.dev", getOption("repos")))
remotes::install_version("Seurat", "4.4.0", repos = c("https://satijalab.r-universe.dev", getOption("repos")))

加載包

# 加載必要的包
# 使用時先改庫路徑
.libPaths(c("~/R_libs/SeuratV4", .libPaths()))

library(Matrix)#1.6-4,一般安裝完需要重啟下R
library(Seurat)
packageVersion("Seurat")   # 應(yīng)返回 4.4.0 或 4.3.x
library(tidyverse)
library(patchwork)
library(ggplot2)
library(cowplot)
library(data.table)
library(dplyr)

加載數(shù)據(jù)

###1.加載數(shù)據(jù)
#單細胞數(shù)據(jù)保存的目錄
dir='/home/dyhscRNA'
#samples=list.files( dir )
samples <-c("GSM6213958","GSM6213959","GSM6213961","GSM6213976","GSM6213980","GSM6213983")
#前面三個是PD-1,后面三個untreated
samples
# dir/pro/outs/filtered_gene_bc_matrices/hg19/
# 里面有 matrix.mtx, features.tsv, barcodes.tsv

###2. 創(chuàng)建Seurat對象
sceList = lapply(samples,function(pro){ 
  # pro=samples[1] 
  print(pro)  
  tmp = Read10X(file.path(dir,pro )) 
  if(length(tmp)==2){
    ct = tmp[[1]] 
  }else{ct = tmp}
  sce =CreateSeuratObject(counts =  ct ,
                          project =  pro  ,
                          min.cells = 5,
                          min.features = 300 )
  return(sce)
}) 
do.call(rbind,lapply(sceList, dim))
sce.all=merge(x=sceList[[1]],
              y=sceList[ -1 ],
              add.cell.ids = samples  ) 

sce.all@assays$RNA@counts[1:5,1:5]
save(sce.all,file = "sce.all")

load("sce.all")
table(sce.all$orig.ident)
sce.all@meta.data[1:3,1:3]

### 3. 數(shù)據(jù)預(yù)處理
#統(tǒng)計一些關(guān)鍵基因的比例,如線粒體基因,核糖體基因,紅細胞基因的比例,根據(jù)這個比例進一步過濾
{
 sce.all[["percent.mt"]] <- PercentageFeatureSet(object = sce.all, pattern = "^MT-") # 人是MT,小鼠是mt
sce.all[["percent.rb"]] <- PercentageFeatureSet(object = sce.all, pattern = "^RP[SL]") # 計算核糖體基因比例
HB.genes <- c("HBA1","HBA2","HBB","HBD","HBE1","HBG1","HBG2","HBM","HBQ1","HBZ") # 計算紅細胞基因比例
HB.genes <- CaseMatch(HB.genes, rownames(sce.all))
sce.all[["percent.HB"]] <- PercentageFeatureSet(object = sce.all, features=HB.genes)

# Visualize QC metrics as a violin plot
# 對 pbmc 對象做小提琴圖,分別為基因數(shù),細胞數(shù)和線粒體占比,核糖體基因占比,紅細胞基因占比
VlnPlot(object = sce.all, features = c("nFeature_RNA", "nCount_RNA", "percent.mt","percent.rb","percent.HB"), ncol = 3)

# 接下來,根據(jù)圖片中基因數(shù)和線粒體數(shù),分別設(shè)置過濾參數(shù),這里基因數(shù) 200-2500,線粒體百分比為小于 5%
sce.all <- subset(x = sce.all, subset = nFeature_RNA > 200 & nFeature_RNA < 7500)
}
#但這里我發(fā)現(xiàn)這個數(shù)據(jù)的這部分基因奇奇怪怪的,且每個樣本的差異太大了,不過濾了。

#所以直接進行下一步
#數(shù)據(jù)歸一化處理,LogNormalize目的是讓整體的數(shù)據(jù)服從正態(tài)分布
sce.all<-NormalizeData(sce.all,
                       normalization.method="LogNormalize",
                       scale.factor=1e4)

#查找高變基因,指一些基因在細胞中表達的浮動比較大,這些往往是我們后續(xù)分群的時候需要關(guān)注的
# 識別高變基因
sce.all <- FindVariableFeatures(sce.all, selection.method = "vst", nfeatures = 2000)
#有 3 種選擇高表達變異基因的方法,可以通過 selection.method參數(shù)來選擇,它們分別是:vst(默認(rèn)值), mean.var.plot (mvp) 和 dispersion (disp)。
# Identify the top highly variable genes
top <- head(x = VariableFeatures(object = sce.all), 5)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(object = sce.all)
plot2 <- LabelPoints(plot = plot1, points = top, repel = TRUE)
plot2

#基因歸一化
# 這里設(shè)置對所有的基因都做了scale,但是需要知道的是,其實后續(xù)的分析都是基于高變基因的,因此,使用默認(rèn)參數(shù)就可以了,而且提升效率
all.genes <- rownames(x = sce.all)
sce.all <- ScaleData(object = sce.all, features = all.genes) # 對所有基因做歸一化處理

# 線性PCA降維
sce.all <- RunPCA(object = sce.all, features = VariableFeatures(object = sce.all))
DimPlot(object = sce.all, reduction = "pca")
VizDimLoadings(sce.all, dims = 1:2, reduction = "pca")
DimHeatmap(sce.all, dims = 1:15, cells = 500, balanced = TRUE)

# 鑒定數(shù)據(jù)集的可用維度,虛線以上的為可用維度
sce.all <- JackStraw(object = sce.all, num.replicate = 100)
sce.all <- ScoreJackStraw(object = sce.all, dims = 1:20)
JackStrawPlot(object = sce.all, dims = 1:20)
JackStrawPlot(object = sce.all, dims = c(1:15))
# 另外一種鑒定手段是繪制所有PC的分布點圖,拐點處的pc作為選定數(shù)目
ElbowPlot(sce.all)

saveRDS(sce.all,'sc_int.rds')#太大了,內(nèi)存

rm(list=ls())
merged_seurat <- readRDS('sc_int.rds') # 讀取 rds
## 聚類
merged_seurat <- FindNeighbors(merged_seurat, dims = 1:20)
merged_seurat <- FindClusters(merged_seurat, resolution = 1.2)

# UMAP降維
merged_seurat <- RunUMAP(merged_seurat, dims = 1:20)
DimPlot(merged_seurat,reduction = 'umap',label=TRUE)



?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容