注:腳本是用GPT生成的,請自己甄別。
- perl gff3_to_bed12.pl in.gff > out.bed12
cat gff3_to_bed12.pl
#!/usr/bin/perl
use strict;
use warnings;
# 用法:
# perl gff3_to_bed12.pl in.gff > out.bed12
#
# 要點(diǎn):
# - 以 mRNA 的 ID 作為轉(zhuǎn)錄本鍵與 BED12 的 name(若有 Name 則優(yōu)先)。
# - exon/CDS 通過 Parent 歸并到對應(yīng) mRNA。
# - 坐標(biāo):GFF 1-based 閉區(qū)間 -> BED 0-based 半開(start-1, end 原樣)。
# - 無 CDS 的轉(zhuǎn)錄本:thickStart=thickEnd=txStart(顯示為非編碼)。
# - blockSizes 與 blockStarts 不帶尾隨逗號。
my $gff = shift or die "Usage: $0 input.gff3 > output.bed12\n";
open my $IN, "<", $gff or die "Cannot open $gff: $!\n";
# 以轉(zhuǎn)錄本 ID 為鍵
my (%CHR, %STR, %NAME, %EXONS, %CDS_S, %CDS_E);
while (<$IN>) {
chomp;
next if /^#/;
my ($chr, undef, $type, $start, $end, undef, $strand, undef, $attr) = split(/\t/);
$type = lc $type;
# 解析屬性為哈希(split 限制 2,兼容像 Class== 之類的情況)
my %A;
for my $kv (split /;/, $attr) {
next unless length $kv;
my ($k, $v) = split(/=/, $kv, 2);
next unless defined $k;
$A{$k} = defined $v ? $v : '';
}
# 轉(zhuǎn)換為 0-based 半開
my $s0 = $start - 1;
my $e0 = $end;
# mRNA 或 transcript 行:記錄基礎(chǔ)信息
if ($type eq 'mrna' or $type eq 'transcript') {
my $tid = $A{ID} // next;
$CHR{$tid} = $chr;
$STR{$tid} = $strand;
$NAME{$tid} = $A{Name} // $tid;
}
# exon/CDS:按 Parent 歸并
elsif ($type eq 'exon' or $type eq 'cds') {
my $parent = $A{Parent} // next;
for my $tid (split /,/, $parent) {
$tid =~ s/^\s+|\s+$//g;
# 若 mRNA 行在后面出現(xiàn),先占位
$CHR{$tid} //= $chr;
$STR{$tid} //= $strand;
$NAME{$tid} //= $tid;
if ($type eq 'exon') {
push @{$EXONS{$tid}}, [$s0, $e0];
} else { # cds
$CDS_S{$tid} = $s0 if !defined $CDS_S{$tid} || $s0 < $CDS_S{$tid};
$CDS_E{$tid} = $e0 if !defined $CDS_E{$tid} || $e0 > $CDS_E{$tid};
}
}
}
}
close $IN;
# 輸出 BED12
for my $tid (keys %EXONS) {
my @ex = sort { $a->[0] <=> $b->[0] } @{$EXONS{$tid}};
next unless @ex; # 必須有外顯子
my $txS = $ex[0]->[0];
my $txE = $ex[-1]->[1];
my @blockSizes = map { $_->[1] - $_->[0] } @ex;
my @blockStarts = map { $_->[0] - $txS } @ex;
# CDS 范圍(無 CDS 則設(shè)為非編碼)
my ($thickS, $thickE) = (defined $CDS_S{$tid} && defined $CDS_E{$tid})
? ($CDS_S{$tid}, $CDS_E{$tid})
: ($txS, $txS);
my $chrom = $CHR{$tid} // 'chrN';
my $strand = $STR{$tid} // '+';
my $name = $NAME{$tid} // $tid;
my $sizes = join(",", @blockSizes); # 無尾逗號
my $starts = join(",", @blockStarts); # 無尾逗號
print join("\t",
$chrom, $txS, $txE, $name, 0, $strand,
$thickS, $thickE, 0,
scalar(@ex),
$sizes,
$starts
), "\n";
}