基因組實戰(zhàn)03: WGS toy example

借鑒Reference中第2、3篇文章的代碼。分析的數(shù)據(jù)是大腸桿菌,因為基因組小,適合拿來快速跑通整個流程


00 下載fastq數(shù)據(jù)


mkdir -p ~/Project/DNA/raw

cd ~/Project/DNA/raw

wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR177/003/SRR1770413/SRR1770413_1.fastq.gz

wget ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR177/003/SRR1770413/SRR1770413_2.fastq.gz


01 filter the bad quality reads and remove adaptors

腳本

#!/bin/bash

#clean.sh

# python3.10/python2.7

# fastqc multiqc trim_galore


HOME_DIR=~/Project/DNA

FASTQ_DIR=${HOME_DIR}/raw

CLEAN_DIR=${HOME_DIR}/clean

N_JOBS=12


if? [ ! -d $CLEAN_DIR ]

then

? ? mkdir -p $CLEAN_DIR

fi


micromamba activate dna2


# 1.QC

cd $FASTQ_DIR

fastqc --thread $N_JOBS *fastq.gz && multiqc .


# 2.remove adapter

micromamba activate dna3

for FILE in `ls $FASTQ_DIR/*_1.fastq.gz`

do

? ? SAMPLE=`basename $FILE | sed s/_1\.fastq\.gz//`

? ? echo "Processing $SAMPLE"


? ? F1=$FASTQ_DIR/$SAMPLE"_1.fastq.gz"

? ? F2=$FASTQ_DIR/$SAMPLE"_2.fastq.gz"


? ? trim_galore \

? ? ? ? --quality 30 \

? ? ? ? --length 50 \

? ? ? ? --output_dir $CLEAN_DIR \

? ? ? ? --cores $N_JOBS \

? ? ? ? --paired \

? ? ? ? --fastqc \

? ? ? ? -e 0.1 \

? ? ? ? --trim-n \

? ? ? ? $F1 $F2


? ? mv $CLEAN_DIR/${SAMPLE}"_1_val_1.fq.gz" \

? ? ? ? $CLEAN_DIR/${SAMPLE}"_1.fastq.gz"


? ? mv $CLEAN_DIR/${SAMPLE}"_2_val_2.fq.gz" \

? ? ? ? $CLEAN_DIR/${SAMPLE}"_2.fastq.gz"


? ? mv $CLEAN_DIR/${SAMPLE}"_1_val_1_fastqc.html" \

? ? ? ? $CLEAN_DIR/${SAMPLE}"_1_fastqc.html"


? ? mv $CLEAN_DIR/${SAMPLE}"_1_val_1_fastqc.zip" \

? ? ? ? $CLEAN_DIR/${SAMPLE}"_1_fastqc.zip"


? ? mv $CLEAN_DIR/${SAMPLE}"_2_val_2_fastqc.html" \

? ? ? ? $CLEAN_DIR/${SAMPLE}"_2_fastqc.html"


? ? mv $CLEAN_DIR/${SAMPLE}"_2_val_2_fastqc.zip" \

? ? ? ? $CLEAN_DIR/${SAMPLE}"_2_fastqc.zip"

done


# 3.QC for clean

cd $CLEAN_DIR

micromamba activate dna2

multiqc .


運行

source clean.sh &> clean.sh.log &

02 align

下載參考基因組數(shù)據(jù)

curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_000005845.2/download?include_annotation_type=GENOME_FASTA,GENOME_GFF,RNA_FASTA,CDS_FASTA,PROT_FASTA,SEQUENCE_REPORT&filename=GCF_000005845.2.zip" -H "Accept: application/zip"

unzip GCF_000005845.2.zip

cp ncbi_dataset/data/GCF_000005845.2/GCF_000005845.2_ASM584v2_genomic.fna genome/E.coli.fa

生成的文件索引

#!/bin/bash

#python2.7

HOME_DIR=~/Project/DNA

GENOME_FILE=E.coli.fa

INDEX_DIR=$HOME_DIR/genome

micromamba activate dna2

echo "----------------- BWA INDEX START-----------------"

bwa index $INDEX_DIR/$GENOME_FILE && \

echo "----------------- BWA INDEX DONE -----------------"


echo "----------------- CreateSequenceDictionary START -----------------"

gatk CreateSequenceDictionary \

? ? --REFERENCE $INDEX_DIR/$GENOME_FILE \

? ? --OUTPUT $INDEX_DIR/E.coli.dict && echo "** dict done **"? && \

echo "----------------- CreateSequenceDictionary DONE -----------------"


samtools faidx $INDEX_DIR/$GENOME_FILE

source index.sh &> index.sh.log &

對比



-R:Read group;


ID:Read group identifier,必須唯一,一般為樣本名;


SM:Sample,樣本名;


LB:Library,測序文庫;


PL:Platform,測序平臺




#!/bin/bash

#python2.7

micromamba activate dna2

HOME_DIR=~/Project/DNA

BWA_INDEX=$HOME_DIR/genome/E.coli.fa

CLEAN_DIR=$HOME_DIR/clean

ALIGN_DIR=$HOME_DIR/align

N_JOBS=12


if? [ ! -d $ALIGN_DIR ]

then

? ? mkdir -p $ALIGN_DIR

fi


for FILE in `ls $CLEAN_DIR/*_1.fastq.gz`

do

? ? SAMPLE=`basename $FILE | sed s/_1\.fastq\.gz//`


? ? echo "----------------- BWA ${SAMPLE} START -----------------"

? ? F1=$CLEAN_DIR/$SAMPLE"_1.fastq.gz"

? ? F2=$CLEAN_DIR/$SAMPLE"_2.fastq.gz"

? ? RG="@RG\tID:"$SAMPLE"\tSM:"$SAMPLE"\tLB:WGS\tPL:ILLUMINA"

? ? bwa mem -t $N_JOBS -R $RG $BWA_INDEX $F1 $F2 | \

? ? ? ? samtools sort --threads $N_JOBS -O bam -o $ALIGN_DIR/$SAMPLE".bam" - && \

? ? echo? "----------------- BWA ${SAMPLE} DONE -----------------"


? ? echo "----------------- MarkDuplicates ${SAMPLE} START-----------------"

? ? gatk MarkDuplicates \

? ? ? ? --INPUT ${ALIGN_DIR}/${SAMPLE}".bam" \

? ? ? ? --OUTPUT ${ALIGN_DIR}/${SAMPLE}".mark.bam" \

? ? ? ? --METRICS_FILE ${ALIGN_DIR}/${SAMPLE}".mark.matrics" && \

? ? echo? "----------------- MarkDuplicates ${SAMPLE} DONE -----------------" &&

? ? rm -f ${ALIGN_DIR}/${SAMPLE}".bam"


? ? echo "-----------------Samtools Indexing ${SAMPLE} START -----------------"

? ? samtools index ${ALIGN_DIR}/${SAMPLE}".mark.bam"? && \

? ? echo? "----------------- Samtools Indexing ${SAMPLE} DONE -----------------"

done

source bwa.sh &> bwa.sh.log &

03 call variants

#!/bin/bash

micromamba activate dna2

HOME_DIR=~/Project/DNA

BWA_INDEX=$HOME_DIR/genome/E.coli.fa

ALIGN_DIR=$HOME_DIR/align

MUTATION=$HOME_DIR/mutation

N_JOBS=12


if? [ ! -d $MUTATION ]

then

? ? mkdir -p $MUTATION

fi


# 1 每個樣本生成一個gvcf文件

for FILE in `ls ${ALIGN_DIR}/*.mark.bam`

do

? ? SAMPLE=`basename $FILE | sed s/\.mark\.bam//`

? ? echo $SAMPLE


? ? gatk HaplotypeCaller \

? ? --reference $BWA_INDEX \

? ? --emit-ref-confidence GVCF \

? ? --input $ALIGN_DIR/$SAMPLE".mark.bam" \

? ? --output $MUTATION/$SAMPLE".gvcf" && echo "** gvcf done **"

done


# 2 通過gvcf檢測變異

gatk GenotypeGVCFs \

--reference $BWA_INDEX \

--variant $MUTATION/$SAMPLE".gvcf" \

--output $MUTATION/final.vcf && echo "** vcf done **"


# 3 壓縮 構(gòu)建tabix索引

bgzip -f $MUTATION/final.vcf

tabix -p vcf $MUTATION/final.vcf.gz


# 4.過濾(硬指標)


## 使用SelectVariants,選出SNP

gatk SelectVariants \

? ? -select-type SNP \

? ? --variant $MUTATION/final.vcf.gz \

? ? --output $MUTATION/final.snp.vcf.gz


## 為SNP作硬過濾

gatk VariantFiltration \

? ? --variant $MUTATION/final.snp.vcf.gz \

? ? --filter-expression "QD < 2.0 || MQ < 40.0 || FS > 60.0 || SOR > 3.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \

? ? --filter-name "PASS" \

? ? --output $MUTATION/final.snp.filter.vcf.gz


## 使用SelectVariants,選出Indel

gatk SelectVariants \

? ? -select-type INDEL \

? ? --variant $MUTATION/final.vcf.gz \

? ? --output $MUTATION/final.indel.vcf.gz


## 為Indel作過濾

gatk VariantFiltration \

? ? --variant $MUTATION/final.indel.vcf.gz \

? ? --filter-expression "QD < 2.0 || FS > 200.0 || SOR > 10.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0" \

? ? --filter-name "PASS" \

? ? --output $MUTATION/final.indel.filter.vcf.gz


# 重新合并過濾后的SNP和Indel

gatk MergeVcfs \

? ? --INPUT $MUTATION/final.snp.filter.vcf.gz \

? ? --INPUT $MUTATION/final.indel.filter.vcf.gz \

? ? --OUTPUT $MUTATION/final.filter.vcf.gz


cd $MUTATION

rm -f *snp* *indel* final.vcf* *gvcf*

source call_variant.sh &> call_variant.sh.log &

Reference

# gatk api

https://gatk.broadinstitute.org/hc/en-us/articles/13832655155099--Tool-Documentation-Index

#GATK4.0和全基因組數(shù)據(jù)分析實踐?

https://mp.weixin.qq.com/s/Heu-jmJeM3EQy2PmyA-gyQ

https://mp.weixin.qq.com/s/ooU7Tuh0adGNKEISELFjUA

#GATK best practices

https://mp.weixin.qq.com/s/GvuL8NlFSkQ6MVdghlEYUQ

# trim-galore

https://mp.weixin.qq.com/s/n3G_qot5mhB3ZR1gcDnV6g

# 方法分享 | 全外顯子測序分析的標準流程

https://mp.weixin.qq.com/s/WOeLHXIomhNjSFXjXZ8zcg

https://mp.weixin.qq.com/s/WOeLHXIomhNjSFXjXZ8zcg

# GATK4全基因組數(shù)據(jù)分析最佳實踐?

https://mp.weixin.qq.com/s/r3sghKtEKq1FnjQ05WCcnA

# 第1節(jié) 測序技術(shù)?

https://mp.weixin.qq.com/s/kq2i5tNZmm9GKGAx2Day-g

#第2節(jié) FASTA和FASTQ?

https://mp.weixin.qq.com/s/0h5B3T6RKTHTsZBuoZvtgQ

#第3節(jié) 數(shù)據(jù)質(zhì)控?

https://mp.weixin.qq.com/s/8hY0U48kiVTH6Yx4JBcAhg#

#第4節(jié) 構(gòu)建WGS主流程?

https://mp.weixin.qq.com/s/AT2oodvdqWgbj1gvVo1hWQ

#第5節(jié) 理解并操作BAM文件?

https://mp.weixin.qq.com/s/UnMyAuUHmK7DMGo8h88oQw


# WGS(全基因組測序)/WES(全外顯子組測序)/WGRS(全基因組重測序)分析與可視化教程

https://zhuanlan.zhihu.com/p/380684876


https://blog.csdn.net/bio_meimei/article/details/108236406


https://zhuanlan.zhihu.com/p/422365954


https://zhuanlan.zhihu.com/p/69726572

https://hpc.nih.gov/training/gatk_tutorial/haplotype-caller.html

?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時請結(jié)合常識與多方信息審慎甄別。
平臺聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點,簡書系信息發(fā)布平臺,僅提供信息存儲服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容