提取 genecode的gtf注釋信息

讀入數(shù)據(jù)

gtf <- rtracklayer::import('gencode.v22.annotation.gtf')#自行下載gtf注釋文件
gtf_df=as.data.frame(gtf) #轉化為矩陣 這一步就可以隨意操作了
head(gtf_df)
  seqnames start   end width strand source       type score phase           gene_id
1     chr1 11869 14409  2541      + HAVANA       gene    NA    NA ENSG00000223972.5
2     chr1 11869 14409  2541      + HAVANA transcript    NA    NA ENSG00000223972.5
3     chr1 11869 12227   359      + HAVANA       exon    NA    NA ENSG00000223972.5
4     chr1 12613 12721   109      + HAVANA       exon    NA    NA ENSG00000223972.5
5     chr1 13221 14409  1189      + HAVANA       exon    NA    NA ENSG00000223972.5
6     chr1 12010 13670  1661      + HAVANA transcript    NA    NA ENSG00000223972.5
                           gene_type gene_status gene_name level          havana_gene     transcript_id
1 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2              <NA>
2 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
3 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
4 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
5 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000456328.2
6 transcribed_unprocessed_pseudogene       KNOWN   DDX11L1     2 OTTHUMG00000000961.2 ENST00000450305.2
                     transcript_type transcript_status transcript_name   tag transcript_support_level
1                               <NA>              <NA>            <NA>  <NA>                     <NA>
2               processed_transcript             KNOWN     DDX11L1-002 basic                        1
3               processed_transcript             KNOWN     DDX11L1-002 basic                        1
4               processed_transcript             KNOWN     DDX11L1-002 basic                        1
5               processed_transcript             KNOWN     DDX11L1-002 basic                        1
6 transcribed_unprocessed_pseudogene             KNOWN     DDX11L1-001 basic                       NA
     havana_transcript exon_number           exon_id         ont protein_id ccdsid
1                 <NA>        <NA>              <NA>        <NA>       <NA>   <NA>
2 OTTHUMT00000362751.1        <NA>              <NA>        <NA>       <NA>   <NA>
3 OTTHUMT00000362751.1           1 ENSE00002234944.1        <NA>       <NA>   <NA>
4 OTTHUMT00000362751.1           2 ENSE00003582793.1        <NA>       <NA>   <NA>
5 OTTHUMT00000362751.1           3 ENSE00002312635.1        <NA>       <NA>   <NA>
6 OTTHUMT00000002844.2        <NA>              <NA> PGO:0000019       <NA>   <NA>

提取gene信息

gene<-gtf_df[gtf_df$type=="gene",]
head(gene)
   seqnames start   end width strand  source type score phase           gene_id
1      chr1 11869 14409  2541      +  HAVANA gene    NA    NA ENSG00000223972.5
13     chr1 14404 29570 15167      -  HAVANA gene    NA    NA ENSG00000227232.5
26     chr1 17369 17436    68      - ENSEMBL gene    NA    NA ENSG00000278267.1
29     chr1 29554 31109  1556      +  HAVANA gene    NA    NA ENSG00000243485.3
37     chr1 30366 30503   138      + ENSEMBL gene    NA    NA ENSG00000274890.1
40     chr1 34554 36081  1528      -  HAVANA gene    NA    NA ENSG00000237613.2
                            gene_type gene_status    gene_name level          havana_gene transcript_id
1  transcribed_unprocessed_pseudogene       KNOWN      DDX11L1     2 OTTHUMG00000000961.2          <NA>
13             unprocessed_pseudogene       KNOWN       WASH7P     2 OTTHUMG00000000958.1          <NA>
26                              miRNA       KNOWN    MIR6859-3     3                 <NA>          <NA>
29                            lincRNA       NOVEL RP11-34P13.3     2 OTTHUMG00000000959.2          <NA>
37                              miRNA       KNOWN    MIR1302-9     3                 <NA>          <NA>
40                            lincRNA       KNOWN      FAM138A     2 OTTHUMG00000000960.1          <NA>
   transcript_type transcript_status transcript_name        tag transcript_support_level havana_transcript
1             <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
13            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
26            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
29            <NA>              <NA>            <NA> ncRNA_host                     <NA>              <NA>
37            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
40            <NA>              <NA>            <NA>       <NA>                     <NA>              <NA>
   exon_number exon_id  ont protein_id ccdsid
1         <NA>    <NA> <NA>       <NA>   <NA>
13        <NA>    <NA> <NA>       <NA>   <NA>
26        <NA>    <NA> <NA>       <NA>   <NA>
29        <NA>    <NA> <NA>       <NA>   <NA>
37        <NA>    <NA> <NA>       <NA>   <NA>
40        <NA>    <NA> <NA>       <NA>   <NA>

獲取想要的信息

colnames(gene)
 [1] "seqnames"                 "start"                    "end"                     
 [4] "width"                    "strand"                   "source"                  
 [7] "type"                     "score"                    "phase"                   
[10] "gene_id"                  "gene_type"                "gene_status"             
[13] "gene_name"                "level"                    "havana_gene"             
[16] "transcript_id"            "transcript_type"          "transcript_status"       
[19] "transcript_name"          "tag"                      "transcript_support_level"
[22] "havana_transcript"        "exon_number"              "exon_id"                 
[25] "ont"                      "protein_id"               "ccdsid"   
pick_info<-c("seqnames","start","end","width","strand","gene_id","gene_name")#提取自己想要的列
ann<-gene[,pick_info]
row.names(ann)<-as.character(ann$gene_id)
head(ann)
                  seqnames start   end width strand           gene_id    gene_name
ENSG00000223972.5     chr1 11869 14409  2541      + ENSG00000223972.5      DDX11L1
ENSG00000227232.5     chr1 14404 29570 15167      - ENSG00000227232.5       WASH7P
ENSG00000278267.1     chr1 17369 17436    68      - ENSG00000278267.1    MIR6859-3
ENSG00000243485.3     chr1 29554 31109  1556      + ENSG00000243485.3 RP11-34P13.3
ENSG00000274890.1     chr1 30366 30503   138      + ENSG00000274890.1    MIR1302-9
ENSG00000237613.2     chr1 34554 36081  1528      - ENSG00000237613.2      FAM138A
write.csv(ann,"gencode_v22_annotation_gene.csv") #輸出結果
gtf <- rtracklayer::import('gencode.vM24.annotation.gtf')
gtf_df=as.data.frame(gtf)
gene<-gtf_df[gtf_df$type=="transcript",]
pick_info<-c("seqnames","start","end","width","strand","gene_id","gene_name","gene_type","transcript_id")
gtf_df_pick<-gene[,pick_info]
write.csv(gtf_df_pick,"~/Desktop/GoogleDrive/Annotation/gencode.vM24.annotation.csv")

寫在最后的話

很多大神用perl和python來提取,對于文本提取這兩個語言有很大的優(yōu)勢,不過需要花時間取理解。平時常用R,同時實驗也比較多,所以就用R來做,還沒入坑的小朋友可以學python,會比較好很多。

最后編輯于
?著作權歸作者所有,轉載或內容合作請聯(lián)系作者
【社區(qū)內容提示】社區(qū)部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發(fā)布,文章內容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務。

相關閱讀更多精彩內容

友情鏈接更多精彩內容