生信小白開始接活——文本處理(三)

經(jīng)過前幾天的學習,終于到了文本處理完結(jié)篇,就是站在前人的肩膀上處理自己的數(shù)據(jù),如下黃框中的數(shù)據(jù)(GSE178341),處理數(shù)據(jù)的過程艱辛異常,主要是沒有R語言的思維,代碼看得懂,但是不會用,只能照葫蘆畫瓢。


1.png

打開GEO界面,一共181個樣本,所有的樣本都在1個hd5文件中,從181個樣本中統(tǒng)計其中8個樣本,問題:1、怎樣讀取hd5文件 2、拆分邏輯(和上篇有異曲同工之妙)


2.png

OK,讓我們開始代碼之旅吧!

下載和讀取數(shù)據(jù)

1、下載數(shù)據(jù)

wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE178nnn/GSE178341/suppl/GSE178341_crc10x_full_c295v4_submit.h5

2、讀取hd5文件
tips:關于hdf5r包的安裝,之前無論用install還是BiocManager都缺少文件,后來險中求勝,用了conda安裝,引用了下述的安裝網(wǎng)址

R Hdf5R :: Anaconda.org

上篇有說過,在 Seurat 中,對于 Cellranger 數(shù)據(jù)的導入,它提供了兩個函數(shù) Read10X 和 Read10X_h5。前者用于讀取 Cellranger 生成的 3 個文件:barcodes.tsv,features.tsv,matrix.mtx,后者則用于讀取生成的 xxxxxx.h5文件,這里我們應用Read10X_h5函數(shù)就行

R
library(Seurat)
a <- Read10X_h5("/home/shpc_a98ef85f8f/SingleronTest/wuxuan/Jun/GSE178341_crc10x_full_c295v4_submit.h5")
a <- CreateSeuratObject(a)
head(a@meta.data, 8)
                                       orig.ident nCount_RNA nFeature_RNA
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT       C103      15093         2953
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT       C103       1156          407
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG       C103      50243         6522
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT       C103      16254         3363
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC       C103      42007         6151
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT       C103       1937          898
C103_T_1_1_0_c1_v2_id-AAACCTGTCGTACGGC       C103       1812          659
C103_T_1_1_0_c1_v2_id-AAACCTGTCTTGCAAG       C103       6816         1300
tail(a@meta.data, 8)
                                        orig.ident nCount_RNA nFeature_RNA
C173_T_0_0_0_c1_v3_id-TTTGGAGAGGGCGAGA       C173       1400          570
C173_T_0_0_0_c1_v3_id-TTTGGAGAGTGATAAC       C173       1676          860
C173_T_0_0_0_c1_v3_id-TTTGGAGCAGCACGAA       C173      10529         2727
C173_T_0_0_0_c1_v3_id-TTTGGAGTCATCGGGC       C173      10047         3009
C173_T_0_0_0_c1_v3_id-TTTGGAGTCTAGTGTG       C173      19533         4219
C173_T_0_0_0_c1_v3_id-TTTGTTGCAGCAATTC       C173       1723          772
C173_T_0_0_0_c1_v3_id-TTTGTTGGTTCTGAGT       C173       1267          506
C173_T_0_0_0_c1_v3_id-TTTGTTGTCGTTCCCA       C173      39137         5151
table(a@meta.data$orig.ident) #查看細胞數(shù)?
 C103  C104  C105  C106  C107  C109  C110  C111  C112  C113  C114  C115  C116 
 6096  2886  3281  3392  7215  4244  5033  5044  6425  5088  4553  1995  5234 
 C118  C119  C122  C123  C124  C125  C126  C129  C130  C132  C133  C134  C135 
 3211  1534  7060 14572 16945 12660 12914 12577 11327  7570  3853  7675  5621 
 C136  C137  C138  C139  C140  C142  C143  C144  C145  C146  C147  C149  C150 
 6609  5774  3408 10468  6179  5140  6359  3899  3091  4731  3801  4015  2418 
 C151  C152  C153  C154  C155  C156  C157  C158  C159  C160  C161  C162  C163 
 3873  3224  4860  3049  6448  3975  8152  2352  2437  3826  3755 22587  4717 
 C164  C165  C166  C167  C168  C169  C170  C171  C172  C173 
 3760  4524  5527  1443  2552  2245 12522 16199  1542  2649 

拆分數(shù)據(jù)

1、利用tidyverse函數(shù)拆分數(shù)據(jù)

library(tidyverse)
a@meta.data$BC <- rownames(a@meta.data) #增加一列BC
head(a@meta.data)
                                       orig.ident nCount_RNA nFeature_RNA
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT       C103      15093         2953
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT       C103       1156          407
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG       C103      50243         6522
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT       C103      16254         3363
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC       C103      42007         6151
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT       C103       1937          898
                                                                           BC
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT

2、繼續(xù)拆分

str_split(a$BC,'_id-',simplify=T) 
 [49995,] "C114_N_1_1_2_c1_v2"  "TAAGCGTTCGTACGGC"
 [49996,] "C114_N_1_1_2_c1_v2"  "TAAGTGCCACGGTAAG"
 [49997,] "C114_N_1_1_2_c1_v2"  "TAAGTGCCAGCTGTAT"
 [49998,] "C114_N_1_1_2_c1_v2"  "TAAGTGCGTTACGTCA"
 [49999,] "C114_N_1_1_2_c1_v2"  "TAAGTGCTCATCTGCC"
 [ reached getOption("max.print") -- omitted 320116 rows ]

str_split(a$BC,'_id-',simplify=T) %>% head()
     [,1]                 [,2]              
[1,] "C103_T_1_1_0_c1_v2" "AAACCTGCATGCTAGT"
[2,] "C103_T_1_1_0_c1_v2" "AAACCTGGTAGCCTAT"
[3,] "C103_T_1_1_0_c1_v2" "AAACCTGGTTGTCGCG"
[4,] "C103_T_1_1_0_c1_v2" "AAACCTGTCATGTGGT"
[5,] "C103_T_1_1_0_c1_v2" "AAACCTGTCCTTGGTC"
[6,] "C103_T_1_1_0_c1_v2" "AAACCTGTCGGATGTT"
a$Sample<- str_split(a$BC,'_id-',simplify=T)[,1]
a$rawBC <- str_split(a$BC,'_id-',simplify=T)[,2] 
head(a@meta.data)
                                       orig.ident nCount_RNA nFeature_RNA
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT       C103      15093         2953
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT       C103       1156          407
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG       C103      50243         6522
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT       C103      16254         3363
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC       C103      42007         6151
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT       C103       1937          898
                                                                           BC
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT
                                                   Sample            rawBC
C103_T_1_1_0_c1_v2_id-AAACCTGCATGCTAGT C103_T_1_1_0_c1_v2 AAACCTGCATGCTAGT
C103_T_1_1_0_c1_v2_id-AAACCTGGTAGCCTAT C103_T_1_1_0_c1_v2 AAACCTGGTAGCCTAT
C103_T_1_1_0_c1_v2_id-AAACCTGGTTGTCGCG C103_T_1_1_0_c1_v2 AAACCTGGTTGTCGCG
C103_T_1_1_0_c1_v2_id-AAACCTGTCATGTGGT C103_T_1_1_0_c1_v2 AAACCTGTCATGTGGT
C103_T_1_1_0_c1_v2_id-AAACCTGTCCTTGGTC C103_T_1_1_0_c1_v2 AAACCTGTCCTTGGTC
C103_T_1_1_0_c1_v2_id-AAACCTGTCGGATGTT C103_T_1_1_0_c1_v2 AAACCTGTCGGATGTT
Leo <- SplitObject(a,split.by='Sample')
Leo
$C172_T_0_0_0_c1_v3
An object of class Seurat 
43113 features across 1542 samples within 1 assay 
Active assay: RNA (43113 features, 0 variable features)

$C173_T_0_0_0_c1_v3
An object of class Seurat 
43113 features across 2649 samples within 1 assay 
Active assay: RNA (43113 features, 0 variable features)

至此,數(shù)據(jù)拆分結(jié)束

統(tǒng)計數(shù)據(jù)

Leo$C162_N_0_0_0_c2_v2 -> GSM5388125
median(GSM5388125 @meta.data$nFeature_RNA)
[1] 944
median(GSM5388125 @meta.data$nCount_RNA)
[1] 3715.5
GSM5388125[["percent.mt"]] <- PercentageFeatureSet( GSM5388125, pattern = "^MT-")
median(GSM5388125@meta.data$percent.mt)
[1] 5.834534
MT5 <- subset(GSM5388125,percent.mt < 5)
MT5
An object of class Seurat 
43113 features across 1357 samples within 1 assay 
Active assay: RNA (43113 features, 0 variable features)
MT30 <- subset(GSM5388125,percent.mt < 30)
MT30
An object of class Seurat 
43113 features across 2880 samples within 1 assay 
Active assay: RNA (43113 features, 0 variable features)

終于統(tǒng)計完了 這周KPI圓滿結(jié)束 ,下周繼續(xù)更新富集分析
周末愉快

?著作權歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容