一次完全失敗的嘗試
COMPLETE FAILURE
以 HTML table 形式呈現(xiàn)芯片數(shù)據(jù)的好處是可以加上各個(gè)公共注釋數(shù)據(jù)庫的鏈接,便于下游分析,而且小巧便捷,用瀏覽器可以直接打開。
制作這樣的 HTML table 的先決條件是要有所用芯片的注釋信息,當(dāng)然制造商一般會(huì)提供可導(dǎo)入 R 的文件格式,例如用 read.csv() 就可以加載 Affymetrix 的 csv 文件。
1. 加載包&數(shù)據(jù)準(zhǔn)備
以 Affymetrix HG-U95Av2 芯片為例,隨機(jī)取15個(gè)基因:
library(annotate)
data("sample.ExpressionSet")
igenes <- featureNames(sample.ExpressionSet)[246:260]
2. 準(zhǔn)備注釋數(shù)據(jù)
library(hgu95av2.db)
## GenBank ID
gb <- as.character(unlist(lookUp(igenes, "hgu95av2", "ACCNUM")))
gb
# [1] "M57423" "Z70218" "L17328" "S81916" "U63332" "M77235"
# [7] "X98175" "AB019392" "J03071" "D25272" "D63789" "D63789"
# [13] "U19142" "U19147" "X16863"
## LocusLink ID
ll <- as.character(unlist(lookUp(igenes, "hgu95av2", "ENTREZID")))
ll
# [1] "221823" "4330" "9637" "5230" "4287" "6331" "841" "27335"
# [9] NA NA "6846" NA NA NA "2215"
## UniGene ID
ug <- as.character(unlist(lookUp(igenes, "hgu95av2", "UNIGENE")))
ug
# [1] "Hs.169284" "Hs.268515" "Hs.258563" "Hs.78771" "Hs.532632" "Hs.517898"
# [7] "Hs.599762" "Hs.314359" NA NA "Hs.458346" NA
# [13] NA NA "Hs.372679" "Hs.694258" "Hs.736230"
## SwissProt ID 一個(gè)探針對(duì)應(yīng)多個(gè)UNIPROT ID,保留為list格式
sp <- lookUp(igenes, "hgu95av2", "UNIPROT")
names(sp) <- NULL
sp
# [[1]]
# [1] "P21108"
#
# [[2]]
# [1] "A0A024R1C3" "Q10571"
#
# [[3]]
# [1] "Q9UHY8"
#
# [[4]]
# [1] "P00558" "V9HWF4"
#
# [[5]]
# [1] "P54252" "A0A0A0MS38" "C9JQV6"
#
# [[6]]
# [1] "Q14524" "Q86V90" "H9KVD2" "E9PG18" "E9PHB6"
# [6] "K4DIA1" "A0A0A0MT39"
#
# [[7]]
# [1] "A0A024R3Z8" "Q14790"
#
# [[8]]
# [1] "K7ERF1" "U3LUI4" "K7EK53" "Q9UBQ5"
#
# [[9]]
# [1] NA
#
# [[10]]
# [1] NA
#
# [[11]]
# [1] "Q9UBD3"
#
# [[12]]
# [1] NA
#
# [[13]]
# [1] NA
#
# [[14]]
# [1] NA
#
# [[15]]
# [1] "O75015" "M9MML6" "A0A087WZR4" "A0A087WU90"
需要把 NA 轉(zhuǎn)換為 "   ; ",原因嘛,我也說不清....
Because the code for SwissProt IDs will not automatically handle missing data, we have to convert the missing data to an HTML empty cell identifier ("?")
## 寫個(gè)循環(huán)改一下
for(i in 1:length(names)) {
if(i == 9 || i == 10 || i ==12 ||
i ==13 || i == 14)
names[i] <- " "
}
sp
# [[1]]
# [1] "P21108"
#
# [[2]]
# [1] "A0A024R1C3" "Q10571"
#
# [[3]]
# [1] "Q9UHY8"
#
# [[4]]
# [1] "P00558" "V9HWF4"
#
# [[5]]
# [1] "P54252" "A0A0A0MS38" "C9JQV6"
#
# [[6]]
# [1] "Q14524" "Q86V90" "H9KVD2" "E9PG18" "E9PHB6"
# [6] "K4DIA1" "A0A0A0MT39"
#
# [[7]]
# [1] "A0A024R3Z8" "Q14790"
#
# [[8]]
# [1] "K7ERF1" "U3LUI4" "K7EK53" "Q9UBQ5"
#
# [[9]]
# [1] " "
#
# [[10]]
# [1] " "
#
# [[11]]
# [1] "Q9UBD3"
#
# [[12]]
# [1] " "
#
# [[13]]
# [1] " "
#
# [[14]]
# [1] " "
#
# [[15]]
# [1] "O75015" "M9MML6" "A0A087WZR4" "A0A087WU90"
## Gene Name
names <- as.character(unlist(lookUp(igenes, "hgu95av2", "GENENAME")))
names
# [1] "phosphoribosyl pyrophosphate synthetase 1 like 1"
# [2] "MN1 proto-oncogene, transcriptional regulator"
# [3] "fasciculation and elongation protein zeta 2"
# [4] "phosphoglycerate kinase 1"
# [5] "ataxin 3"
# [6] "sodium voltage-gated channel alpha subunit 5"
# [7] "caspase 8"
# [8] "eukaryotic translation initiation factor 3 subunit K"
# [9] NA
# [10] NA
# [11] "X-C motif chemokine ligand 2"
# [12] NA
# [13] NA
# [14] NA
# [15] "Fc fragment of IgG receptor IIIb"
for(i in 1:length(names)) {
if(i == 9 || i == 10 || i ==12 ||
i ==13 || i == 14)
names[i] <- " "
}
names
# [1] "phosphoribosyl pyrophosphate synthetase 1 like 1"
# [2] "MN1 proto-oncogene, transcriptional regulator"
# [3] "fasciculation and elongation protein zeta 2"
# [4] "phosphoglycerate kinase 1"
# [5] "ataxin 3"
# [6] "sodium voltage-gated channel alpha subunit 5"
# [7] "caspase 8"
# [8] "eukaryotic translation initiation factor 3 subunit K"
# [9] " "
# [10] " "
# [11] "X-C motif chemokine ligand 2"
# [12] " "
# [13] " "
# [14] " "
# [15] "Fc fragment of IgG receptor IIIb"
3. 一些相關(guān)的值
取前10個(gè)基因作為例子:
dat <- exprs(sample.ExpressionSet)[igenes,1:10]
FC <- rowMeans(dat[igenes,1:5]) - rowMeans(dat[igenes,6:10])
pval <- esApply(sample.ExpressionSet[igenes,1:10], 1, function(x) t.test(x[1:5], x[6:10])$p.value)
tstat <- esApply(sample.ExpressionSet[igenes,1:10], 1, function(x) t.test(x[1:5], x[6:10])$statistic)
4. 構(gòu)建 HTML table
genelist <- list(igenes, ug, ll, gb, sp)
filename <- "Interesting_genes.html"
title <- "An Artificial Set of Interesting Genes"
othernames <- list(names, round(tstat, 2), round(pval, 3), round(FC, 1), round(dat, 2))
head <- c("Probe ID", "UniGene", "LocusLink", "GenBank", "SwissProt", "Gene Name", "t-statistic", "p-value",
"Fold Change", "Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5", "Sample 6",
"Sample 7", "Sample 8", "Sample 9", "Sample 10")
repository <- list("affy", "ug", "en", "gb", "sp")
htmlpage(genelist, filename, title, othernames, head, repository = repository)
然而很絕望....

所以是為什么呢,一步一步和作者的代碼對(duì)照,只能發(fā)現(xiàn)??


區(qū)別主要就是這么個(gè)區(qū)別,于是只好再操作一波。
for(i in 1:15) {
if(i == 9 || i == 10 || i ==12 ||
i ==13 || i == 14)
ll[i] <- "---"
}
for(i in 1:15) {
if(i == 9 || i == 10 || i ==12 ||
i ==13 || i == 14)
ug[i] <- "---"
}
ll
# [1] "221823" "4330" "9637" "5230" "4287" "6331" "841" "27335" "---"
# [10] "---" "6846" "---" "---" "---" "2215"
ug
# [1] "Hs.169284" "Hs.268515" "Hs.258563" "Hs.78771" "Hs.532632" "Hs.517898"
# [7] "Hs.599762" "Hs.314359" "---" "---" "Hs.458346" "---"
# [13] "---" "---" "Hs.372679" "Hs.694258" "Hs.736230"
還是不行....一毛一樣的報(bào)錯(cuò)
絕望中打開了作者的 R Script

所以其實(shí)都是這樣來的嗎??
那么,如果,按照這些代碼從頭來一遍會(huì)怎樣呢?

fine 果然成功了。
而且也沒多么好看嘛,作者原話是 nice looking&pretty
“已經(jīng)折騰了半天就不能白折騰”
現(xiàn)在要開始做一些也許毫無意義的排錯(cuò)。
> gb_t <- as.character(unlist(lookUp(igenes, "hgu95av2", "ACCNUM")))
> gb
[1] "M57423" "Z70218" "L17328" "S81916" "U63332"
[6] "M77235" "X98175" "AB019392" "J03071" "D25272"
[11] "D63789" "D63789" "U19142" "U19147" "X16863"
> gb_t
[1] "M57423" "Z70218" "L17328" "S81916" "U63332"
[6] "M77235" "X98175" "AB019392" "J03071" "D25272"
[11] "D63789" "D63789" "U19142" "U19147" "X16863"
> class(gb)
[1] "character"
> class(gb_t)
[1] "character"
GenBank ID clear
> ll_t <- as.character(unlist(lookUp(igenes, "hgu95av2", "ENTREZID")))
> ll
[1] "221823" "4330" "9637" "---" "---" "6331" "841"
[8] "27335" "---" "---" "6375" "---" "2543" "2578"
[15] "2215"
> ll_t
[1] "221823" "4330" "9637" "5230" "4287" "6331" "841"
[8] "27335" NA NA "6846" NA NA NA
[15] "2215"
> for(i in 1:15) {
+ if(i == 9 || i == 10 || i ==12 ||
+ i ==13 || i == 14)
+ ll_t[i] <- "---"
+ }
> ll_t
[1] "221823" "4330" "9637" "5230" "4287" "6331" "841"
[8] "27335" "---" "---" "6846" "---" "---" "---"
[15] "2215"
> class(ll)
[1] "character"
> class(ll_t)
[1] "character"
LocusLink ID clear
> ug_t <- as.character(unlist(lookUp(igenes, "hgu95av2", "UNIGENE")))
> ug
[1] "Hs.169284" "Hs.268515" "Hs.103419" "Hs.380429" "---"
[6] "Hs.169331" "Hs.381231" "Hs.283781" "---" "---"
[11] "Hs.3195" "---" "Hs.176660" "Hs.272484" "Hs.372679"
> ug_t
[1] "Hs.169284" "Hs.268515" "Hs.258563" "Hs.78771" "Hs.532632"
[6] "Hs.517898" "Hs.599762" "Hs.314359" NA NA
[11] "Hs.458346" NA NA NA "Hs.372679"
[16] "Hs.694258" "Hs.736230"
> for(i in 1:15) {
+ if(i == 9 || i == 10 || i ==12 ||
+ i ==13 || i == 14)
+ ug_t[i] <- "---"
+ }
> ug_t
[1] "Hs.169284" "Hs.268515" "Hs.258563" "Hs.78771" "Hs.532632"
[6] "Hs.517898" "Hs.599762" "Hs.314359" "---" "---"
[11] "Hs.458346" "---" "---" "---" "Hs.372679"
[16] "Hs.694258" "Hs.736230"
> class(ug)
[1] "character"
> class(ug_t)
[1] "character"
UniGene ID clear
> sp_t <- lookUp(igenes, "hgu95av2", "UNIPROT")
> sp_t
$`31485_at`
[1] "P21108"
$`31486_s_at`
[1] "A0A024R1C3" "Q10571"
$`31487_at`
[1] "Q9UHY8"
$`31488_s_at`
[1] "P00558" "V9HWF4"
$`31489_at`
[1] "P54252" "A0A0A0MS38" "C9JQV6"
$`31490_at`
[1] "Q14524" "Q86V90" "H9KVD2" "E9PG18" "E9PHB6"
[6] "K4DIA1" "A0A0A0MT39"
$`31491_s_at`
[1] "A0A024R3Z8" "Q14790"
$`31492_at`
[1] "K7ERF1" "U3LUI4" "K7EK53" "Q9UBQ5"
$`31493_s_at`
[1] NA
$`31494_at`
[1] NA
$`31495_at`
[1] "Q9UBD3"
$`31496_g_at`
[1] NA
$`31497_at`
[1] NA
$`31498_f_at`
[1] NA
$`31499_s_at`
[1] "O75015" "M9MML6" "A0A087WZR4" "A0A087WU90"
> names(sp_t) <- NULL
> for(i in 1:15) {
+ if(i == 9 || i == 10 || i ==12 ||
+ i ==13 || i == 14)
+ sp_t[[i]][1] <- " "
+ }
> sp
[[1]]
[1] "P21108"
[[2]]
[1] "Q10571"
[[3]]
[1] "Q9UHY8"
[[4]]
[1] "Q16444"
[[5]]
[1] " "
[[6]]
[1] "Q14524 " " Q8IZC9 " " Q8WTQ6 " " Q8WWN5 " " Q96J69"
[[7]]
[1] "Q14790"
[[8]]
[1] "Q9UBQ5"
[[9]]
[1] " "
[[10]]
[1] " "
[[11]]
[1] "P47992"
[[12]]
[1] " "
[[13]]
[1] "Q13065 " " Q8IYC5"
[[14]]
[1] "Q13070"
[[15]]
[1] "O75015"
> sp_t
[[1]]
[1] "P21108"
[[2]]
[1] "A0A024R1C3" "Q10571"
[[3]]
[1] "Q9UHY8"
[[4]]
[1] "P00558" "V9HWF4"
[[5]]
[1] "P54252" "A0A0A0MS38" "C9JQV6"
[[6]]
[1] "Q14524" "Q86V90" "H9KVD2" "E9PG18" "E9PHB6"
[6] "K4DIA1" "A0A0A0MT39"
[[7]]
[1] "A0A024R3Z8" "Q14790"
[[8]]
[1] "K7ERF1" "U3LUI4" "K7EK53" "Q9UBQ5"
[[9]]
[1] " "
[[10]]
[1] " "
[[11]]
[1] "Q9UBD3"
[[12]]
[1] " "
[[13]]
[1] " "
[[14]]
[1] " "
[[15]]
[1] "O75015" "M9MML6" "A0A087WZR4" "A0A087WU90"
> class(sp)
[1] "list"
> class(sp_t)
[1] "list"
SwissProt ID clear
> name_t <- as.character(unlist(lookUp(igenes, "hgu95av2", "GENENAME")))
> name_t
[1] "phosphoribosyl pyrophosphate synthetase 1 like 1"
[2] "MN1 proto-oncogene, transcriptional regulator"
[3] "fasciculation and elongation protein zeta 2"
[4] "phosphoglycerate kinase 1"
[5] "ataxin 3"
[6] "sodium voltage-gated channel alpha subunit 5"
[7] "caspase 8"
[8] "eukaryotic translation initiation factor 3 subunit K"
[9] NA
[10] NA
[11] "X-C motif chemokine ligand 2"
[12] NA
[13] NA
[14] NA
[15] "Fc fragment of IgG receptor IIIb"
> for(i in 1:length(names)) {
+ if(i == 9 || i == 10 || i ==12 ||
+ i ==13 || i == 14)
+ name_t[i] <- " "
+ }
> name
[1] "hypothetical protein LOC221823"
[2] "meningioma (disrupted in balanced translocation) 1"
[3] "fasciculation and elongation protein zeta 2 (zygin II)"
[4] "Phosphoglycerate kinase {alternatively spliced}"
[5] " "
[6] "sodium channel, voltage-gated, type V, alpha polypeptide"
[7] "caspase 8, apoptosis-related cysteine protease"
[8] "muscle specific gene"
[9] " "
[10] " "
[11] "chemokine (C motif) ligand 1"
[12] " "
[13] "G antigen 1"
[14] "G antigen 6"
[15] "Fc fragment of IgG, low affinity IIIb, receptor for (CD16)"
> name_t
[1] "phosphoribosyl pyrophosphate synthetase 1 like 1"
[2] "MN1 proto-oncogene, transcriptional regulator"
[3] "fasciculation and elongation protein zeta 2"
[4] "phosphoglycerate kinase 1"
[5] "ataxin 3"
[6] "sodium voltage-gated channel alpha subunit 5"
[7] "caspase 8"
[8] "eukaryotic translation initiation factor 3 subunit K"
[9] " "
[10] " "
[11] "X-C motif chemokine ligand 2"
[12] " "
[13] " "
[14] " "
[15] "Fc fragment of IgG receptor IIIb"
> class(name)
[1] "character"
> class(name_t)
[1] "character"
Gene Name clear
## 這些都是一毛一樣的
dat <- exprs(sample.ExpressionSet)[igenes,1:10]
FC <- rowMeans(dat[igenes,1:5]) - rowMeans(dat[igenes,6:10])
pval <- esApply(sample.ExpressionSet[igenes,1:10], 1, function(x) t.test(x[1:5], x[6:10])$p.value)
tstat <- esApply(sample.ExpressionSet[igenes,1:10], 1, function(x) t.test(x[1:5], x[6:10])$statistic)
## 剩下的都對(duì)應(yīng)地加上"_t"
genelist_t <- list(igenes, ug_t, ll_t, gb_t, sp_t)
filename_t <- "Interesting_genes_t.html"
title_t <- "An Artificial Set of Interesting Genes"
othernames_t <- list(name_t, round(tstat, 2), round(pval, 3), round(FC, 1), round(dat, 2))
head_t <- c("Probe ID", "UniGene", "LocusLink", "GenBank", "SwissProt", "Gene Name", "t-statistic", "p-value",
"Fold Change", "Sample 1", "Sample 2", "Sample 3", "Sample 4", "Sample 5", "Sample 6",
"Sample 7", "Sample 8", "Sample 9", "Sample 10")
repository_t <- list("affy", "ug", "en", "gb", "sp")
htmlpage(genelist_t, filename_t, title_t, othernames_t, head_t, repository = repository_t)

行吧。

最后,向大家隆重推薦生信技能樹的一系列干貨!
- 生信技能樹全球公益巡講:https://mp.weixin.qq.com/s/E9ykuIbc-2Ja9HOY0bn_6g
- B站公益74小時(shí)生信工程師教學(xué)視頻合輯:https://mp.weixin.qq.com/s/IyFK7l_WBAiUgqQi8O7Hxw
- 招學(xué)徒:https://mp.weixin.qq.com/s/KgbilzXnFjbKKunuw7NVfw
