嘮嘮叨叨：

因?yàn)樘幚鞟TAC-seq的原始fastq數(shù)據(jù)太大（已知GEO號(hào)在ncbi上下載），所以被老師要求用python編程分割fastq腳本，其實(shí)用軟件seqkit就可以進(jìn)行分割，命令簡單還快速，想分多少份就分多少份，但老師說我不練怎么會(huì)編程呢，所以沒辦法，我這剛?cè)腴T的小白，就在網(wǎng)上各種查資料，發(fā)現(xiàn)都是處理fasta格式的，因?yàn)閒astq格式和fasta格式并不一樣，不知道可以借鑒不，日夜研究別人的代碼，以及翻找網(wǎng)絡(luò)，讀到一篇將fastq文件寫入和讀出的代碼，理解并做出了嘗試，然后又絞盡腦汁如何進(jìn)行分割，后來想到按行分配，只是需要每次都查看一下文件有多少行，然后又搜了一下代碼，想把兩者結(jié)合起來，中間報(bào)了各種error,就反復(fù)調(diào)試，7.1的那一天上午成功了，但是我取的是10000行，不知道生成的文件為什么是7000多行，沒找到原因，沒有打開生成的文件看看，下午老師過來看我發(fā)現(xiàn)是沒有加換行符的問題，實(shí)際上輸出是對(duì)的

分割fastq腳本：

需要引入時(shí)間函數(shù)（此處參考網(wǎng)上），可以看腳本分割用了多久

gz文件打開需要引入gzip函數(shù)

from datetime import datetime

import re

import sys

def Main():#定義函數(shù)，讀入fastq文件

? ? source_dir = sys.argv[1] #傳入的參數(shù)1：目標(biāo)文件夾所在目錄

? ? read_number=int(sys.argv[2]) #傳入的參數(shù)2：需要分割的行數(shù)，需要是4的倍數(shù)，腳本里需要驗(yàn)證一下

? ? target_dir = sys.argv[3]#傳入的參數(shù)3：分割后文件所在的目錄

? ? #print(target_dir)

? ? fas={}

? ? id=None

? ? name = 1

? ? N=0 #讀取數(shù)據(jù)的行數(shù)數(shù)值，初始值為0

? ? if read_number%4 != 0:

? ? ? print("參數(shù)錯(cuò)誤，程序退出")

? ? ? sys.exit()

? ? else:

? ? ? print("開始。。。。。")

? ? ? print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

? ? ? with gzip.open(source_dir,'rb') as f:

? ? ? ? ? for line in f:

? ? ? ? ? ? N+=1

? ? ? ? ? ? if line.decode()[0]=="@":

? ? ? ? ? ? ? ? id=line.decode().strip().split()[0][1:]#對(duì)二進(jìn)制文件進(jìn)行解碼

? ? ? ? ? ? ? ? fas[id]=[]? #將該id對(duì)應(yīng)的初始值為空

? ? ? ? ? ? else:

? ? ? ? ? ? ? ? seq=fas[id].append(line.decode())?

? ? ? ? ? ? ? ? #print(fas)

? ? ? ? ? ? if N==read_number:

? ? ? ? ? ? #? with gzip.open(target_dir+'/57.%04d.fq.gz'%(name),'wb') as out:

? ? ? ? ? ? ? ? #? print(target_dir+'/57.%04d.fq.gz'%(name))

? ? ? ? ? ? ? ? ? ? if re.findall('r1', source_dir, flags=re.IGNORECASE):? #正則匹配可以不區(qū)分大小寫或直接判斷R1是否在source_dir里，需大寫：if R in source_dir:

? ? ? ? ? ? ? ? ? ? ? ? with gzip.open(target_dir+'.R1.%04d.fq.gz'%(name),'wb') as out:

? ? ? ? ? ? ? ? ? ? ? ? ? ? print(target_dir+'.R1.%04d.fq.gz'%(name))

? ? ? ? ? ? ? ? ? ? ? ? ? ? for id,seq in fas.items():?

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? res = '@'+id+"/1"+"\n"+seq[0]+seq[1]+seq[2]

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? out.write(res.encode())

? ? ? ? ? ? ? ? ? ? ? ? name+=1

? ? ? ? ? ? ? ? ? ? ? ? N=0

? ? ? ? ? ? ? ? ? ? ? ? fas={}

? ? ? ? ? ? ? ? ? ? else:

? ? ? ? ? ? ? ? ? ? ? ? with gzip.open(target_dir+'.R2.%04d.fq.gz'%(name),'wb') as out:

? ? ? ? ? ? ? ? ? ? ? ? ? ? print(target_dir+'.R2.%04d.fq.gz'%(name))

? ? ? ? ? ? ? ? ? ? ? ? ? ? for id,seq in fas.items():

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? res = '@'+id+"/2"+"\n"+seq[0]+seq[1]+seq[2]

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? out.write(res.encode())

? ? ? ? ? ? ? ? ? ? ? ? name+=1

? ? ? ? ? ? ? ? ? ? ? ? N=0

? ? ? ? ? ? ? ? ? ? ? ? fas={}

? ? ? ? ? ? else:

? ? ? ? ? ? ? ? ? continue

? ? ? print("完成。。。。。")

? ? ? print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

if __name__ == "__main__":

? ? Main()

優(yōu)化：此腳本由我的導(dǎo)師提出幾個(gè)優(yōu)化步驟又進(jìn)行改進(jìn)，以上是完整版，改進(jìn)為：一傳入?yún)?shù)；二對(duì)傳入的文件名進(jìn)行判斷是含R1還是R2，如果含R1在文件的read_name后加上/1，反之R2加上/2；三將輸出的文件名的數(shù)固定住

對(duì)于我的腳本有一些缺陷：如果文件生于的行數(shù)怎么辦，雖然此腳本有考慮到，但我沒運(yùn)行成功，二就是分割的時(shí)間太慢

7月的第一天，我一個(gè)小白能寫出腳本感覺異常欣慰，二是班長通知六級(jí)報(bào)名，而我去年六級(jí)竟然裸考過了，終于不用再去奔赴六級(jí)的考場(chǎng)了，哈哈，開心，感謝我讀過的英文文獻(xiàn)

老師版：

第二天我們老師在我旁邊又教我寫出另一個(gè)版本，即按read數(shù)進(jìn)行分割fastq文件，不得不說我們老師真厲害，代碼比我簡潔多了,而且教我寫出只用了一個(gè)小時(shí)，時(shí)間上也會(huì)慢些，應(yīng)該是文件較大，但是不用考慮剩余行的問題，只需要提前計(jì)算好分割的reads數(shù)

import gzip

from datetime import datetime

import re

import sys

def Main():

? ? test_R1=False

? ? test_R2=False

? ? source_dir = sys.argv[1]

? ? read_number=int(sys.argv[2])

? ? target_dir = sys.argv[3]

? ? #print(target_dir)

? ? fas={}

? ? id=None

? ? name = 1

? ? N=0 #讀取數(shù)據(jù)的行數(shù)數(shù)值，初始值為0

? ? print("開始。。。。。")

? ? print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

? ? with gzip.open(source_dir,'rb') as f:

? ? ? for line in f:

? ? ? ? if N%(4*read_number)==0:

? ? ? ? ? ? if re.findall('r1', source_dir, flags=re.IGNORECASE):

? ? ? ? ? ? ? ? ? f_out = gzip.open(target_dir+'.R1.%04d.fq.gz'%(name),'wb')

? ? ? ? ? ? ? ? ? print(target_dir+'.R1.%04d.fq.gz'%(name))

? ? ? ? ? ? ? ? ? test_R1 = True

? ? ? ? ? ? ? ? ? suffix="/1"

? ? ? ? ? ? elif re.findall('r2', source_dir, flags=re.IGNORECASE):

? ? ? ? ? ? ? ? ? f_out = gzip.open(target_dir+'.R2.%04d.fq.gz'%(name),'wb')

? ? ? ? ? ? ? ? ? print(target_dir+'.R2.%04d.fq.gz'%(name))

? ? ? ? ? ? ? ? ? test_R2= True

? ? ? ? ? ? ? ? ? suffix="/2"

? ? ? ? ? ? else:

? ? ? ? ? ? ? ? ? print("Error: R1 or R2 in filename not detected，exit")

? ? ? ? ? ? ? ? ? sys.exit()

? ? ? ? ? ? name+=1?

? ? ? ? N+=1

? ? ? ? if N%4==1:

? ? ? ? ? ? if test_R1:

? ? ? ? ? ? ? ? if re.findall('/1', line.decode(), flags=re.IGNORECASE):

? ? ? ? ? ? ? ? ? ? f_out.write(line)

? ? ? ? ? ? ? ? else:

? ? ? ? ? ? ? ? ? ? f_out.write((line.decode().strip().split()[0]+suffix+"\n").encode())

? ? ? ? ? ? if test_R2:

? ? ? ? ? ? ? ? ? if re.findall('/2', line.decode(), flags=re.IGNORECASE):

? ? ? ? ? ? ? ? ? ? f_out.write(line)

? ? ? ? ? ? ? ? ? else:

? ? ? ? ? ? ? ? ? ? f_out.write((line.decode().strip().split()[0]+suffix+"\n").encode())

? ? ? ? elif N%4==3:

? ? ? ? ? ? f_out.write(("+\n").encode())

? ? ? ? else:

? ? ? ? ? ? f_out.write(line)

? ? print("完成。。。。。")

? ? print(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

if __name__ == "__main__":

? ? Main()

這兩段代碼都已運(yùn)行成功

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av

用python分割fastq文件的腳本（自寫）

用python分割fastq文件的腳本（自寫）

嘮嘮叨叨：

分割fastq腳本：

老師版：

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九 欧美,1769亚洲,黄色成人av

用python分割fastq文件的腳本（自寫）

嘮嘮叨叨：

分割fastq腳本：

老師版：

友情鏈接更多精彩內(nèi)容

色偷偷精品伊人,欧洲久久精品,欧美综合婷婷骚逼,国产AV主播,国产最新探花在线,九色在线视频一区,伊人大交九欧美,1769亚洲,黄色成人av