病情說明:
雖然我是單細(xì)胞師兄,但是也做過宏基因組、代謝組和單細(xì)胞轉(zhuǎn)錄組的聯(lián)合分析。文末會(huì)提供kneaddata的代碼,聯(lián)系我可以獲得docker鏡像和分析數(shù)據(jù)哦?。≡诖?,我先說明一下如果使用你們自己的數(shù)據(jù),我們先要把數(shù)據(jù)整理成這種格式:
- 那就是需要把fastq文件準(zhǔn)備成這個(gè)樣子(序列id相同,/1 /2用來區(qū)分雙端測序數(shù)據(jù)),這個(gè)一定要注意!??!不然kneaddata可以正常運(yùn)行,但是clean data為空。
-
文件名為sampleid_R1.fastq.gz和sampleid_R2.fastq.gz
image.png
image.png
治療方案
import time
import threading
import argparse
import os
from pickle import FALSE
import threading
import shutil
import datetime
import json
import time
import subprocess
import pandas as pd
method_path = 'plot_scripts'
refer_path = 'DB'
# 1: check function
def knead_check(data_path: str = None, ## String: path to offline data
refer_path: str = None, ## String: path to database
species: str = None, ## String: species: {human} and {mouse}
out_path: str = None ## String: output path for knead procedure
):
## check sampleinfo file
sampleinfo = f'{data_path}/sample.txt'
if not os.path.exists(sampleinfo):
raise SystemExit(' No sampleinfo file ! Please check datasets !')
else:
pass
## choose species
kneaddata_database = refer_path + '/kneaddb'
if species == 'human':
kneaddb = kneaddata_database + '/human_hg19/hg19'
elif species == 'mouse':
kneaddb = kneaddata_database + '/mouse_C57BL/mouse_C57BL_6NJ'
else:
pass
if not os.path.exists(out_path):
os.makedirs(out_path, exist_ok=True)
os.system(f'cp {sampleinfo} {out_path}')
else:
pass
## output
check_file_path = f'{out_path}/knead_check_file.txt'
check_file = pd.DataFrame([
data_path,
species,
kneaddb,
sampleinfo
])
check_file.to_csv(check_file_path,
header=None, index=None, sep='\t', mode='w')
return 0
class FunctionStatusChecker:
def __init__(self, func: callable):
self.func = func
self.start_time = None
self.end_time = None
self.lock = threading.Lock()
def __call__(self, *args, **kwargs):
self.start_time = time.time()
result = self.func(*args, **kwargs)
self.end_time = time.time()
return result
def is_executed(self) -> bool:
with self.lock:
return self.start_time is not None and self.end_time is not None
@FunctionStatusChecker
def execCmd(cmd):
try:
print("命令{}開始運(yùn)行{}".format(cmd, datetime.datetime.now()))
time.sleep(5)
os.system(cmd)
print("命令{}結(jié)束運(yùn)行{}".format(cmd, datetime.datetime.now()))
except:
print("{}\t運(yùn)行失敗".format(cmd))
@FunctionStatusChecker
def RunKnead(out_path:str = None ## String: output path for knead procedure)
):
## set parameters
knead_check = pd.read_csv(f'{out_path}/knead_check_file.txt', header=None,delimiter='\t')
data_path = knead_check.at[0,0]
species = knead_check.at[1,0]
kneaddb = knead_check.at[2,0]
sampleinfo = knead_check.at[3,0]
sample_info_dic = {}
cmds = []
for line in open(sampleinfo):
if not line.startswith("sample"):
line=line.strip().split()
sample_info_dic[line[0]]=line[1]
cmd= f'''/home/soft/miniconda3/envs/Rdoc/bin/kneaddata -t 6 -v \
-i1 {data_path}/{line[0]}_R1.fastq.gz \
-i2 {data_path}/{line[0]}_R2.fastq.gz \
-o {out_path}/{line[0]} -db {kneaddb} \
--cat-final-output --output-prefix {line[0]} --serial \
--trimmomatic /home/soft/Trimmomatic-0.39/ \
--trimmomatic-options 'SLIDINGWINDOW:4:20 MINLEN:50' \
--bowtie2 /home/soft/miniconda3/envs/Rdoc/bin/ \
--bowtie2-options '--very-sensitive --dovetail' \
--remove-intermediate-output \
--fastqc /home/soft/miniconda3/envs/Rdoc/bin/ \
--trf /home/soft/miniconda3/envs/Rdoc/bin/ \
--run-fastqc-start \
--run-fastqc-end '''
cmds.append(cmd)
threads = []
for cmd in cmds:
th = threading.Thread(target=execCmd, args=(cmd,))
th.start()
th.join()
time.sleep(2)
threads.append(th)
return out_path
@FunctionStatusChecker
def Run_Stats(out_path:str = None ## String: output path for knead procedure
):
StatsDir = out_path + '/readsinfo'
if os.path.exists(StatsDir):
pass
else:
os.mkdir(StatsDir)
x= [item for item in os.walk(out_path)]
for path, dirs, files in x:
for file in files:
if file.endswith('.log'):
shutil.copy(path+os.sep+file,StatsDir)
cmd = f"cd {out_path} && kneaddata_read_count_table --input readsinfo/ --output kneaddata_read_count_table.tsv && \
cut -f 1-5,12-13 kneaddata_read_count_table.tsv | sed 's/_R1_kneaddata//;s/pair//g' > kneaddata_report.txt "
execCmd(cmd)
def knead_call(out_path:str = None ## String: output path for knead procedure
):
function_checker = RunKnead
result = function_checker(out_path)
cleandata_path = os.path.abspath(out_path)
sampleinfo = os.path.abspath(out_path)+'/sample.txt'
## output
call_file_path = f'{out_path}/knead_call_file.txt'
call_file = pd.DataFrame([cleandata_path,sampleinfo])
call_file.to_csv(call_file_path,
header=None, index=None, sep='\t', mode='w')
step1 = function_checker.is_executed()
if step1 is True:
function_checker2 = Run_Stats
result = function_checker2(out_path)
#print(result)
step2 = function_checker.is_executed()
else:
wait()
return 0
def knead_plot(out_path:str = None ## String: output path for knead procedure
):
cmd = f" Rscript {method_path}/knead_plot.R -w {out_path} "
execCmd(cmd)
return 0
data_path = './RawData/dre'
species= 'mouse'
out_path ='kneaddout'
knead_check(data_path,refer_path,species,out_path)
knead_call('kneaddout')
knead_plot('kneaddout')
3. 補(bǔ)充內(nèi)容
如果需要的話,我可以提供打包好的宏基因組分析docker鏡像和測試數(shù)據(jù),另外持續(xù)關(guān)注哦,因?yàn)槲視?huì)持續(xù)更新 ?。?!

