基因的chr,start,end都是已知的 (這個(gè)文件需要下載,可以是UCSC,或者NCBI),也就是TSS文件。
測試文件見我的github.
import sys#引入sys模塊
import os
os.chdir('D:/python/question10')
args = sys.argv#調(diào)用命令行參數(shù)
class Genome_info:#創(chuàng)建類Genome_info
def __init__(self):
self.chr = ""
self.start = 0
self.end = 0#初始化屬性
class Gene(Genome_info):#創(chuàng)建父類Genome_info之下的類Gene
def __init__(self):
Genome_info.__init__(self)
self.orientation = ""
self.id = "" #初始化屬性
list_chr = {} #定義染色體列表
with open('TSS.bed') as fp_gene: #導(dǎo)入?yún)?shù)1,即TSS.txt
for line in fp_gene:
if line.startswith("#"): #如果某行以#開頭則越過
continue
lines = line.strip("\n").split("\t") #每行去除換行,以制表符分割
id = lines[0] #第一欄為基因id
chr = lines[1] #第二欄為染色體號
start = int(lines[2]) #第三欄轉(zhuǎn)為整數(shù)型
end = int(lines[3]) #第四欄轉(zhuǎn)為整數(shù)型
orientation = lines[4] #第五欄為基因方向
if not chr in list_chr: #如果染色體號在列表里不存在就初始化一下
list_chr[chr] = {}
gene = Gene() #初始化基因
gene.chr= chr #初始化染色體
gene.start = start #初始化基因起始位點(diǎn)
gene.end = end #初始化基因結(jié)束位點(diǎn)
gene.id = id #初始化ID
gene.orientation = orientation #初始化基因方向
list_chr[chr][id] = gene #將基因鍵、值存入list_chr字典
with open('pos.txt') as fp_pos: #導(dǎo)入?yún)?shù)2,即pos.txt
for line in fp_pos:
gene_list = [] #初始化gene_list
lines = line.strip('\n').split('\t') #每行去除換行,用制表符分割
(chr, start, end) = (lines[0], int(lines[1]), int(lines[2])) #取出染色體號,起始坐標(biāo)和結(jié)束坐標(biāo),后兩者均轉(zhuǎn)為整數(shù)型
for gene_id, gene in list_chr[chr].items(): #判斷pos.txt中基因位置與TSS.txt中是否有重疊
if gene.start <= start <= gene.end or gene.start <= end <= gene.end or start <= gene.start <= end or start <= gene.end <= end:
gene_list.append(gene.id) #如有則將基因ID添加至列表gene_list
print(gene_list) #輸出gene_list