篩選出兩個(gè)分支的菌株的顯著性差異為點(diǎn)的Python代碼

import numpy
import collections
import scipy.stats
import math
import vcf
###讀取SNP位點(diǎn)
pos_list=[]
vcf_reader=vcf.Reader(filename=r'suis.vcf')
for record in vcf_reader:
    pos_list.append(int(record.POS))


branch1_snp_list=[]
branch2_snp_list=[]
total_snp_list=[]
#讀取fasta文件
with open(r'suis_branch1.fasta') as f:
    for line in f:
        if line.startswith('>'):
            continue
        else:
            snp=line.replace('\n','')
            snp=str(snp)
            branch1_snp_list.append(list(snp))
            total_snp_list.append(list(snp))
with open(r'suis_branch2.fasta') as f:
    for line in f:
        if line.startswith('>'):
            continue
        else:
            snp=line.replace('\n','')
            snp = str(snp)
            branch2_snp_list.append(list(snp))
            total_snp_list.append(list(snp))
branch1_array=[]
branch2_array=[]
total_array=[]
array_len=len(branch1_snp_list[0])
print("一共有%d個(gè)位點(diǎn)" % array_len)
for i in range(0,array_len):
    branch1_array.append(0)
    branch2_array.append(0)
    total_array.append(0)
branch1_array=numpy.array(branch1_array)
branch2_array=numpy.array(branch2_array)
total_array=numpy.array(total_array)
#將兩個(gè)分支的序列合并成矩陣
for i in branch1_snp_list:
    i=numpy.array(i)
    branch1_array=numpy.column_stack((branch1_array,i))
branch1_array = numpy.delete(branch1_array, 0, axis=1)###矩陣的列數(shù)為branch1內(nèi)菌株的個(gè)數(shù),行數(shù)為總共SNP位點(diǎn)的個(gè)數(shù)

for i in branch2_snp_list:
    i=numpy.array(i)
    branch2_array=numpy.column_stack((branch2_array,i))
branch2_array = numpy.delete(branch2_array, 0, axis=1)

for i in total_snp_list:
    i=numpy.array(i)
    total_array=numpy.column_stack((total_array,i))
total_array = numpy.delete(total_array, 0, axis=1)

###卡方檢驗(yàn)函數(shù)
def kafang(a,b,c,d,thre,index):
    n=(a+b+c+d)*1.0
    branch1=(a+b)*1.0
    branch2=(c+d)*1.0
    major=(a+c)*1.0
    minor=(b+d)*1.0
    TRC_a=branch1*(major/n)
    TRC_b=branch1-TRC_a
    TRC_c=branch2*(major/n)
    TRC_d=branch2-TRC_c
    out_res=0
    X2=0
    Pval=0
    ###判斷理論數(shù)的大小,以選擇不同的卡方檢驗(yàn)
    if(n>=40):
        if(TRC_a>=5 and TRC_b>=5 and TRC_c>=5 and TRC_d >=5):
            X2=math.pow((a-TRC_a),2)/TRC_a+math.pow((b-TRC_b),2)/TRC_b+math.pow((c-TRC_c),2)/TRC_c+math.pow((d-TRC_d),2)/TRC_d
            if(thre==0.05):
                if(X2>=3.84):
                    out_res=1
            elif(thre==0.01):
                if(X2>=6.64):
                    out_res=1
        elif(TRC_a>=1 and TRC_b>=1 and TRC_c>=1 and TRC_d>=1):
            X2=math.pow((abs(a-TRC_a)-0.5),2)/TRC_a+math.pow((abs(b-TRC_b)-0.5),2)/TRC_b+math.pow((abs(c-TRC_c)-0.5),2)/TRC_c+math.pow((abs(d-TRC_d)-0.5),2)/TRC_d
            if (thre == 0.05):
                if (X2 >= 3.84):
                    out_res = 1
            elif (thre == 0.01):
                if (X2 >= 6.64):
                    out_res = 1
        else:
            Pval=scipy.stats.fisher_exact([[a,b],[c,d]])[1]
            if(thre==0.05):
                if(Pval<0.05):
                    out_res=1
            elif(thre==0.01):
                if(Pval<0.01):
                    out_res=1
    else:
        Pval = scipy.stats.fisher_exact([[a, b], [c, d]])[1]
        if (thre == 0.05):
            if (Pval < 0.05):
                out_res = 1
        elif (thre == 0.01):
            if (Pval < 0.01):
                out_res = 1
    with open(r'test.log','a+') as f:###輸出所有位點(diǎn)的記錄
        print("snp_pos:",pos_list[index],file=f)
        if(X2==0):
            print("Pval:",Pval,file=f)
        else:
            print("X2:", X2,file=f)
        print("num\tmajor_allele\tminor_allele",file=f)
        print("bran1\t%d\t%d" % (a,b),file=f)
        print("bran2\t%d\t%d" % (c,d),file=f)
    finall_out=0
    if(out_res==1):
        if(a>b and d>c):
            if(a/branch1>=0.8 and d/branch2>=0.8):
                finall_out = 1
                with open(r'diff_pos.log','a+') as f:
                    print("snp_pos:", pos_list[index], file=f)
                    if (X2 == 0):
                        print("Pval:", Pval, file=f)
                    else:
                        print("X2:", X2, file=f)
                    print("num\tmajor_allele\tminor_allele", file=f)
                    print("bran1\t%d\t%d" % (a, b), file=f)
                    print("bran2\t%d\t%d" % (c, d), file=f)
        if(b>a and c>d):
            if(b/branch1>=0.8 and c/branch2>=0.8):
                finall_out = 1
                with open(r'diff_pos.log','a+') as f:
                    print("snp_pos:", pos_list[index], file=f)
                    if (X2 == 0):
                        print("Pval:", Pval, file=f)
                    else:
                        print("X2:", X2, file=f)
                    print("num\tmajor_allele\tminor_allele", file=f)
                    print("bran1\t%d\t%d" % (a, b), file=f)
                    print("bran2\t%d\t%d" % (c, d), file=f)
    return finall_out



###選出兩個(gè)分支矩陣中的兩種堿基及其對(duì)應(yīng)的個(gè)數(shù)
for i in range(0,len(total_array)):
    total_most_common=collections.Counter(total_array[i]).most_common()
    if len(total_most_common)==2:
        major_base=total_most_common[0][0]
        minor_base=total_most_common[1][0]
        branch1_most_common=collections.Counter(branch1_array[i]).most_common()
        branch2_most_common = collections.Counter(branch2_array[i]).most_common()
        ###對(duì)branch1進(jìn)行處理
        if len(branch1_most_common)==1:
            if branch1_most_common[0][0]==major_base:
                branch1_major_num=branch1_most_common[0][1]
                branch1_minor_num=0
            else:
                branch1_major_num = 0
                branch1_minor_num = branch1_most_common[0][1]
        else:
            if branch1_most_common[0][0]==major_base:
                branch1_major_num = branch1_most_common[0][1]
                branch1_minor_num = branch1_most_common[1][1]
            else:
                branch1_major_num = branch1_most_common[1][1]
                branch1_minor_num = branch1_most_common[0][1]
        ###對(duì)branch2進(jìn)行處理
        if len(branch2_most_common)==1:
            if branch2_most_common[0][0]==major_base:
                branch2_major_num=branch2_most_common[0][1]
                branch2_minor_num=0
            else:
                branch2_major_num = 0
                branch2_minor_num = branch2_most_common[0][1]
        else:
            if branch2_most_common[0][0]==major_base:
                branch2_major_num = branch2_most_common[0][1]
                branch2_minor_num = branch2_most_common[1][1]
            else:
                branch2_major_num = branch2_most_common[1][1]
                branch2_minor_num = branch2_most_common[0][1]
        ###進(jìn)行卡方檢驗(yàn)
        finall_out=kafang(a=branch1_major_num,b=branch1_minor_num,c=branch2_major_num,d=branch2_minor_num,thre=0.05,index=i)
        if(finall_out==1):
            with open(r'sign_diff_pos', 'a+') as f:
                chrome="FM252032.1"
                print("%s\t%d" % (chrome,pos_list[i]),file=f)
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

友情鏈接更多精彩內(nèi)容