GFF或GTF格式轉bed

# 1. gff2bed和gtf2bed

首先,gff2bed和gtf2bed都是BEDOPS的程序,所以使用之前需要先安裝BEDOPS。

## Linux平臺安裝BEDOPS

$ git clone https://github.com/bedops/bedops.git
$ cd bedops
$ make
$ make install

複製可執行文件到環境路徑:

$ cp bin/* /usr/local/bin

## 使用

GFF文件和GTF文件均來自GENCODE的註釋文件。
GFF文件:gencode.v19.annotation.gff3
GTF文件:gencode.v19.annotation.gtf

### GFF格式

gff2bed <gencode.v19.annotation.gff3 > test.bed
convert2bed -i gff -o bed <gencode.v19.annotation.gff3 > test.bed
  • 文件內容查看
    gencode.v19.annotation.gff3
##gff-version 3
#description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74)
#provider: GENCODE
#contact: gencode@sanger.ac.uk
#format: gff3
#date: 2014-09-18
##sequence-region chr1 1 249250621
chr1    HAVANA  gene    11869   14412   .       +       .       ID=ENSG00000223972.4;gene_id=ENSG00000223972.4;transcript_id=ENSG0000
0223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_status=KNOWN;transcript_name
=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
chr1    HAVANA  transcript      11869   14409   .       +       .       ID=ENST00000456328.2;Parent=ENSG00000223972.4;gene_id=ENSG000
00223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcr
ipt;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751
.1;tag=basic
chr1    HAVANA  exon    11869   12227   .       +       .       ID=exon:ENST00000456328.2:1;Parent=ENST00000456328.2;gene_id=ENSG0000
0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana_gene=OTTHUMG00000000961
.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
chr1    HAVANA  exon    12613   12721   .       +       .       ID=exon:ENST00000456328.2:2;Parent=ENST00000456328.2;gene_id=ENSG0000
0223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=processed_transcri
pt;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=2;exon_id=ENSE00003582793.1;level=2;havana_gene=OTTHUMG00000000961
.2;havana_transcript=OTTHUMT00000362751.1;tag=basic

test.bed

chr1    11868   12227   ENSG00000223972.4       .       +       HAVANA  exon    .       ID=exon:ENST00000456328.2:1;Parent=ENST000004
56328.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;exon_number=1;exon_id=ENSE00002234944.1;level=2;havana
_gene=OTTHUMG00000000961.2;havana_transcript=OTTHUMT00000362751.1;tag=basic
chr1    11868   14409   ENSG00000223972.4       .       +       HAVANA  transcript      .       ID=ENST00000456328.2;Parent=ENSG00000
223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000456328.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
t_type=processed_transcript;transcript_status=KNOWN;transcript_name=DDX11L1-002;level=2;havana_gene=OTTHUMG00000000961.2;havana_trans
cript=OTTHUMT00000362751.1;tag=basic
chr1    11868   14412   ENSG00000223972.4       .       +       HAVANA  gene    .       ID=ENSG00000223972.4;gene_id=ENSG00000223972.
4;transcript_id=ENSG00000223972.4;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript_type=pseudogene;transcript_stat
us=KNOWN;transcript_name=DDX11L1;level=2;havana_gene=OTTHUMG00000000961.2
chr1    11871   12227   ENSG00000223972.4       .       +       ENSEMBL exon    .       ID=exon:ENST00000515242.2:1;Parent=ENST000005
15242.2;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcript
_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;exon_number=1;exon_id=ENSE00002234632.1;
level=3;havana_gene=OTTHUMG00000000961.2
chr1    11871   14412   ENSG00000223972.4       .       +       ENSEMBL transcript      .       ID=ENST00000515242.2;Parent=ENSG00000
223972.4;gene_id=ENSG00000223972.4;transcript_id=ENST00000515242.2;gene_type=pseudogene;gene_status=KNOWN;gene_name=DDX11L1;transcrip
t_type=transcribed_unprocessed_pseudogene;transcript_status=KNOWN;transcript_name=DDX11L1-201;level=3;havana_gene=OTTHUMG00000000961.
2

### GTF 格式

gtf2bed <gencode.v19.annotation.gtf > test.bed
convert2bed -i gtf -o bed <gencode.v19.annotation.gtf > test.bed

# 2. 自己寫的shell命令

## GTF

cat gencode.v19.annotation.gtf | awk -F '[\t *;]' '/^chr/{if($3=="transcript"){print $1,$4-1,$5,$10,$13,$22,$7,$3}}' OFS="\t" >test.bed

cat gencode.v19.annotation.gtf |sed 's/;//' | awk -F '[\t *]' '/^chr/{if($3=="transcript"){print $1,$4-1,$5,$10,$12,$21,$7,$3}}' OFS="\t" >test.bed

## GFF

cat gencode.v19.annotation.gff3 | awk -F '[\t;]' '/^chr/{if($3=="exon"){print $1,$4-1,$5,$9,$11,$12,$15,$7,$3}}' OFS="\t" | sed -e 's/ID=//' -e 's/gene_id=//' -e 's/transcript_id=//' -e 's/gene_name=//' >test.bed
最後編輯於
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容