file="Orthogroups/Orthogroups.tsv"
ogs={}
with open(file) as f:
l = 0
for line in f:
l+=1
line=[line.strip() for line in line.split('\t')]
if l == 1:
species = line[1:]
continue
z = [z.split(', ') for z in line[1:]]
ogs[line[0]] = dict(zip(species,z))
>>> len(ogs)
20667
>>> a = ogs["OG0020664"]
>>> a
{'Averrhoa_carambola': [''], 'Carica_papaya': [''], 'Coffea_canephora': [''], 'Prunus_avium': [''], 'Prunus_persica': [''], 'Ricinus_communis': [''], 'Theobroma_cacao': [''], 'Vitis_vinifera': ['GSVIVG01006304001', 'GSVIVG01006461001']}
>>> a['Vitis_vinifera']
['GSVIVG01006304001', 'GSVIVG01006461001']
持續(xù)更新......
針對(duì)OrthoFinder的結(jié)果Results_*/Orthogroups/Orthogroups.tsv進(jìn)行處理。
Python學(xué)的太差了,給自己定幾個(gè)題目,進(jìn)行學(xué)習(xí):
- 任意提取某個(gè)OG下某個(gè)物種的所有基因
- 統(tǒng)計(jì)各個(gè)物種特有OG下的基因
- 統(tǒng)計(jì)單拷貝OG
- 統(tǒng)計(jì)各個(gè)OG單拷貝率
...
隨著代碼的掌握,進(jìn)行優(yōu)化。
singleogs=[]
a=[]
for og,spgenes in ogs.items(): # 每個(gè)OG進(jìn)行一次循環(huán)
i=0
for sp,genes in spgenes.items(): # 每個(gè)OG的每個(gè)物種進(jìn)行一次循環(huán)
genes = [ i for i in genes if i != '']
if len(genes) == 1:
i+=1 # 如果物種的基因數(shù)目等于1,i加1
if i == len(ogs[og]): # 最終i的數(shù)目等于 len(ogs[og])
singleogs.append(og)
>>> len(singleogs)
4298
>>> ogs[singleogs[1]]
{'Averrhoa_carambola': ['geneYangtao2006611'], 'Carica_papaya': ['110807233'], 'Coffea_canephora': ['Cc02_g35740'], 'Prunus_avium': ['gene-LOC110759850'], 'Prunus_persica': ['18784265'], 'Ricinus_communis': ['J2O13_05G011667'], 'Theobroma_cacao': ['Thecc1EG019630'], 'Vitis_vinifera': ['GSVIVG01036485001']}