宏基因組分析(一)kneaddata質(zhì)控及代碼分享

病情說明:

雖然我是單細(xì)胞師兄,但是也做過宏基因組、代謝組和單細(xì)胞轉(zhuǎn)錄組的聯(lián)合分析。文末會(huì)提供kneaddata的代碼,聯(lián)系我可以獲得docker鏡像和分析數(shù)據(jù)哦?。≡诖?,我先說明一下如果使用你們自己的數(shù)據(jù),我們先要把數(shù)據(jù)整理成這種格式:

  • 那就是需要把fastq文件準(zhǔn)備成這個(gè)樣子(序列id相同,/1 /2用來區(qū)分雙端測序數(shù)據(jù)),這個(gè)一定要注意!??!不然kneaddata可以正常運(yùn)行,但是clean data為空。
  • 文件名為sampleid_R1.fastq.gz和sampleid_R2.fastq.gz


    image.png

    image.png

治療方案

import time
import threading
import argparse
import os
from pickle import FALSE
import threading
import shutil
import datetime
import json
import time
import subprocess
import pandas as pd

method_path = 'plot_scripts'
refer_path = 'DB'

# 1: check function
def knead_check(data_path: str = None,                ## String: path to offline data
                refer_path: str = None,               ## String: path to database
                species: str = None,                  ## String: species: {human} and {mouse}
                out_path: str = None                  ## String: output path for knead procedure
):
    ## check sampleinfo file
    sampleinfo = f'{data_path}/sample.txt'
    if not os.path.exists(sampleinfo):
        raise SystemExit(' No sampleinfo file ! Please check datasets !')
    else:
        pass

    ## choose species
    kneaddata_database = refer_path + '/kneaddb'
    if species == 'human':
        kneaddb = kneaddata_database + '/human_hg19/hg19'
    elif species == 'mouse':
        kneaddb = kneaddata_database + '/mouse_C57BL/mouse_C57BL_6NJ'
    else:
        pass

    if not os.path.exists(out_path):
        os.makedirs(out_path, exist_ok=True)
        os.system(f'cp {sampleinfo} {out_path}')
    else:
        pass
    ## output
    check_file_path = f'{out_path}/knead_check_file.txt'
    check_file = pd.DataFrame([
        data_path,
        species,
        kneaddb,
        sampleinfo
    ])
    
    check_file.to_csv(check_file_path,
                      header=None, index=None, sep='\t', mode='w')
    return 0
    
class FunctionStatusChecker:
    def __init__(self, func: callable):
        self.func = func
        self.start_time = None
        self.end_time = None
        self.lock = threading.Lock()
    def __call__(self, *args, **kwargs):
        self.start_time = time.time()
        result = self.func(*args, **kwargs)
        self.end_time = time.time()
        return result
    def is_executed(self) -> bool:
        with self.lock:
            return self.start_time is not None and self.end_time is not None

@FunctionStatusChecker
def execCmd(cmd):
    try:
        print("命令{}開始運(yùn)行{}".format(cmd, datetime.datetime.now()))
        time.sleep(5)
        os.system(cmd)
        print("命令{}結(jié)束運(yùn)行{}".format(cmd, datetime.datetime.now()))
    except:
        print("{}\t運(yùn)行失敗".format(cmd))      
        
@FunctionStatusChecker
def RunKnead(out_path:str = None                  ## String: output path for knead procedure)
              ):
    ## set parameters
    knead_check = pd.read_csv(f'{out_path}/knead_check_file.txt', header=None,delimiter='\t')
    data_path = knead_check.at[0,0]
    species = knead_check.at[1,0]
    kneaddb = knead_check.at[2,0]
    sampleinfo = knead_check.at[3,0]
    
    sample_info_dic = {}
    cmds = []
    for line in open(sampleinfo):
        if not line.startswith("sample"):
            line=line.strip().split()
            sample_info_dic[line[0]]=line[1]
            cmd= f'''/home/soft/miniconda3/envs/Rdoc/bin/kneaddata -t 6 -v \
                -i1 {data_path}/{line[0]}_R1.fastq.gz \
                -i2 {data_path}/{line[0]}_R2.fastq.gz \
                -o  {out_path}/{line[0]} -db {kneaddb} \
                --cat-final-output --output-prefix {line[0]}  --serial \
                --trimmomatic /home/soft/Trimmomatic-0.39/ \
                --trimmomatic-options 'SLIDINGWINDOW:4:20 MINLEN:50' \
                --bowtie2 /home/soft/miniconda3/envs/Rdoc/bin/ \
                --bowtie2-options '--very-sensitive --dovetail' \
                --remove-intermediate-output \
                --fastqc /home/soft/miniconda3/envs/Rdoc/bin/ \
                --trf /home/soft/miniconda3/envs/Rdoc/bin/ \
                --run-fastqc-start \
                --run-fastqc-end '''
        cmds.append(cmd)
        threads = []
    for cmd in cmds:
        th = threading.Thread(target=execCmd, args=(cmd,))
        th.start()
        th.join()
        time.sleep(2)
        threads.append(th)
    return out_path
    
@FunctionStatusChecker
def Run_Stats(out_path:str = None                  ## String: output path for knead procedure
             ):
    StatsDir = out_path + '/readsinfo'
    if os.path.exists(StatsDir):
        pass
    else:
        os.mkdir(StatsDir)
    x= [item for item in os.walk(out_path)]
    for path, dirs, files in x:
        for file in files:
            if file.endswith('.log'):
                shutil.copy(path+os.sep+file,StatsDir)
    cmd = f"cd {out_path} && kneaddata_read_count_table --input  readsinfo/ --output kneaddata_read_count_table.tsv && \
    cut -f 1-5,12-13 kneaddata_read_count_table.tsv | sed 's/_R1_kneaddata//;s/pair//g' > kneaddata_report.txt "
    execCmd(cmd)
    
def knead_call(out_path:str = None                  ## String: output path for knead procedure
        ):
    function_checker = RunKnead
    result = function_checker(out_path) 
    cleandata_path = os.path.abspath(out_path)
    sampleinfo = os.path.abspath(out_path)+'/sample.txt'
    ## output
    call_file_path = f'{out_path}/knead_call_file.txt'
    call_file = pd.DataFrame([cleandata_path,sampleinfo])

    call_file.to_csv(call_file_path,
                      header=None, index=None, sep='\t', mode='w')
    
    step1 = function_checker.is_executed()
    if step1 is True:
        function_checker2 = Run_Stats
        result = function_checker2(out_path) 
        #print(result)  
        step2 = function_checker.is_executed()
    else:
        wait()
    return 0

def knead_plot(out_path:str = None                  ## String: output path for knead procedure
              ):
    cmd = f" Rscript {method_path}/knead_plot.R -w {out_path} "
    execCmd(cmd)
    return 0
    
data_path = './RawData/dre'
species= 'mouse'     
out_path ='kneaddout'
knead_check(data_path,refer_path,species,out_path)
knead_call('kneaddout')
knead_plot('kneaddout')

3. 補(bǔ)充內(nèi)容

如果需要的話,我可以提供打包好的宏基因組分析docker鏡像和測試數(shù)據(jù),另外持續(xù)關(guān)注哦,因?yàn)槲視?huì)持續(xù)更新 ?。?!

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容