數(shù)據(jù)導(dǎo)入:這里使用valr包自帶的數(shù)據(jù)
library(valr)
bedfile <- valr_example("genes.hg19.chr22.bed.gz") ## 基因或者目標(biāo)區(qū)域坐標(biāo)信息
genomefile <- valr_example("hg19.chrom.sizes.gz") ## 染色體長度信息
bgfile <- valr_example("hela.h3k4.chip.bg.gz") ## 信號值文件
genes <- read_bed(bedfile, n_fields = 6)
genome <- read_genome(genomefile)
y <- read_bedgraph(bgfile)
導(dǎo)入前的數(shù)據(jù)格式為

image.png
寫成函數(shù), 計(jì)算區(qū)域內(nèi)的信號值
get_right_Gene_bins_cov <- function(bedfile, num_win, y){
## bedfile 即我們的基因區(qū)域文件
## num_win 表示我們需要劃分成多少個bin
## y 表示我們需要的信號值
F_Genes <- bedfile %>%
filter(strand == "+") %>%
bed_makewindows(num_win = num_win)
## 篩選正向的坐標(biāo),如果為正向,可以直接正常分
R_Genes <- bedfile %>%
filter(strand == "-") %>%
bed_makewindows(num_win = num_win, reverse = T)
## 如果是反向的,那么最后分組的順序倒置過來
## 操作見之前的valr說明
new_Genes <- rbind(F_Genes, R_Genes)
## 合并得到的正向和反向的文件
GeneBody <- bed_map(new_Genes, y, win_sum = sum(value, na.rm = TRUE)) %>%
## bed_map 即根據(jù)所提供的信號值文件y, 計(jì)算我們目標(biāo)分好的每一個bin中的信號值的總數(shù)
group_by(.win_id) %>%
## 按照我們劃分bin時候,bed_makewindows生成的20個bin的編號來分組
summarize(win_mean = mean(win_sum, na.rm = TRUE))
## 計(jì)算每一個bin中對應(yīng)的所有區(qū)域數(shù)據(jù)的信號值的平均值
}
通過函數(shù)計(jì)算genebody區(qū)的信號值,將其劃分為20個bin,最后的提取信號值列
GeneBody <- get_right_Gene_bins_cov(genes, 20, y)[,2]
計(jì)算Promoter區(qū)每一個bin中的信號
region_size <- 2000
# 50 bp windows
win_size <- 100
bin_number <- region_size *2 /win_size
## 即 Promoter區(qū)域加downstream區(qū)域總共40個bin
#### 獲取Promoter的信息
F_promoter <- genes %>%
filter(strand == "+") %>%
bed_flank(genome, left = region_size) %>%
get_right_Gene_bins_cov(20, y)
R_promoter <- genes %>%
filter(strand == "-") %>%
bed_flank(genome, right = region_size) %>%
get_right_Gene_bins_cov(20, y)
## 再將正負(fù)信息合并,取平均值
Promoter <- data.frame(win_mean = apply(merge(F_promoter, R_promoter, by = ".win_id")[,2:3], 1, mean))
計(jì)算downstream區(qū)每一個bin中的信號
#### 獲取down的信息
F_down <- genes %>%
filter(strand == "+") %>%
bed_flank(genome, right = region_size) %>%
get_right_Gene_bins_cov(20, y)
R_down <- genes %>%
filter(strand == "-") %>%
bed_flank(genome, left = region_size) %>%
get_right_Gene_bins_cov(20, y)
## 再將正負(fù)信息合并,取平均值
Down <- data.frame(win_mean = apply(merge(F_down, R_down, by = ".win_id")[,2:3], 1, mean))
合并Promoter、GeneBody、Downstream三個區(qū)域的信息,得到一個20*3 = 60行的矩陣
bind_cov <- rbind(Promoter, GeneBody, Down)
end_cov <- cbind(.win_id = seq_len(nrow(bind_cov)), bind_cov)
# 即前20行為Promoter區(qū),中間20行為GeneBody區(qū),最后20行為Downstream區(qū)
得到了矩陣就很方便了,使用ggplot2畫圖
library(ggplot2)
ggplot(end_cov, aes(x = .win_id, y = win_mean)) +
geom_point(size = 0.25) + geom_line(size = 1) +
scale_x_continuous(labels = c("-2Kb","TSS", "TES", "2Kb"), breaks = seq(0,bin_number*3/2, by = bin_number/2)) +
theme_bw() + xlab("") + ylab("Signal") +
theme(legend.key=element_rect(linetype='dashed',color="white"),
axis.text.y = element_text(size=20),axis.text.x = element_text(size=20),
legend.title = element_text(size=15),legend.text = element_text(size=15),
legend.key.height=unit(1.2,'cm'))

image.png
將信號值取log2進(jìn)行可視化
ggplot(end_cov, aes(x = .win_id, y = log2(win_mean))) +
geom_point(size = 0.25) + geom_line(size = 1) +
scale_x_continuous(labels = c("-2Kb","TSS", "TES", "2Kb"), breaks = seq(0,bin_number*3/2, by = bin_number/2)) +
theme_bw() + xlab("") + ylab("Signal") +
theme(legend.key=element_rect(linetype='dashed',color="white"),
axis.text.y = element_text(size=20),axis.text.x = element_text(size=20),
legend.title = element_text(size=15),legend.text = element_text(size=15),
legend.key.height=unit(1.2,'cm'))

image.png
最后補(bǔ)充關(guān)于bed_makewindows函數(shù),舉例
> x <- trbl_interval(~chrom, ~start, ~end, ~name, ~score, ~strand,
"chr1", 100, 200,'A','.','+',
"chr1", 300, 400, 'B', '.', '-')
> bed_makewindows(x, num_win = 5)
# A tibble: 10 x 7
chrom start end name score strand .win_id
<chr> <int> <int> <chr> <chr> <chr> <int>
1 chr1 100 120 A . + 1
2 chr1 120 140 A . + 2
3 chr1 140 160 A . + 3
4 chr1 160 180 A . + 4
5 chr1 180 200 A . + 5
6 chr1 300 320 B . - 1
7 chr1 320 340 B . - 2
8 chr1 340 360 B . - 3
9 chr1 360 380 B . - 4
10 chr1 380 400 B . - 5
> bed_makewindows(x, num_win = 5, reverse = T)
# A tibble: 10 x 7
chrom start end name score strand .win_id
<chr> <int> <int> <chr> <chr> <chr> <int>
1 chr1 100 120 A . + 5
2 chr1 120 140 A . + 4
3 chr1 140 160 A . + 3
4 chr1 160 180 A . + 2
5 chr1 180 200 A . + 1
6 chr1 300 320 B . - 5
7 chr1 320 340 B . - 4
8 chr1 340 360 B . - 3
9 chr1 360 380 B . - 2
10 chr1 380 400 B . - 1
可以看出來,在不指定reverse參數(shù)為True時候,只有正方向的bin的劃分格式為正確的,反方向的bin劃分的時候應(yīng)該為加了reverse參數(shù)為True時候這種劃分方式,即應(yīng)該是反的。所以我們區(qū)分正負(fù)方向劃分bin的時候只需先將正負(fù)方向各自提取出來,然后正方向按照正常的方式來劃分bin,反方向按照reverse反方向劃分bin,然后在合并兩個文件的bin的信息,同一個bin取均值即可。
同理,我們在得到啟動子和下游2kb區(qū)間時候利用bed_flank函數(shù)的right 和 left參數(shù),劃分好正負(fù)反向然后合并,即可。
image.png
