1. 安裝軟件
1.1. 下載安裝MMAPPR
wget http://yostlab.genetics.utah.edu/wp-content/uploads/2019/02/MMAPPR_083.tar.gz
tar -zxvf MMAPPR_083.tar.gz
cd MMAPPR_083
1.2. 下載安裝USeq軟件
wget https://github.com/HuntsmanCancerInstitute/USeq/releases/download/USeq_9.3.1/USeq_9.3.1.zip
unzip USeq_9.3.1.zip
2. 構建參考基因組索引
該索引用于后面Hisat2比對
hisat2-build ./B73_v5.fa B73_v5.fa
3. 利用Hisat2進行序列比對
mkdir 01Cleandata 02Alignment
for i in mut WT
do
#去接頭
java -jar trimmomatic-0.39.jar PE -phred33 -threads 8 ./${i}_1.fq.gz ./${i}_2.fq.gz ./01Cleandata/${i}_1_cleandata.fq.gz ./01Cleandata/${i}_good_unpaired.fq.gz ./01Cleandata/${i}_2_cleandata.fq.gz ./01Cleandata/${i}_2_unpaired.fq.gz LEADING:20 TRAILING:20 SLIDINGWINDOW:4:15 MINLEN:20
#比對到基因組
hisat2 --dta -x B73_v5 -p 8 -1 ./01Cleandata/${i}_cleandata.fq.gz -2 ./01Cleandata/${i}_2_cleandata.fq.gz -S ./02Alignment/${i}.sam
將sam文件轉化為bam文件
samtools view -S ./02Alignment/${i}.sam -b > ./02Alignment/${i}.bam
#為了節(jié)約空間,可以將不用的sam文件刪除
sudo rm -r ./02Alignment/${i}.sam
samtools sort ./02Alignment/${i}.bam -o ./02Alignment/${i}_sorted.bam
done
4. 準備MMAPPR的注釋文件
4.1 將基因組分成不同的染色體
pip install pyfaidx
faidx -x B73_v5.fa
mkdir ${path}/MMAPPR_083/annotation_files/B73_v5
cp chr*.fa ${path}/MMAPPR_083/annotation_files/B73_v5
4.2 給基因組建立索引
該索引是用于MMAPPR軟件調用
samtools faidx 能夠對fasta 序列建立一個后綴為.fai 的文件,根據(jù)這個.fai 文件和原始的fastsa文件, 能夠快速的提取任意區(qū)域的序列
其他提取序列的方法:
samtools faidx input.fa chr1 > chr1.fa
samtools faidx input.fa chr1:100-200 > chr1.fa
samtools faidx B73_v5.fa
cp ./B73_v5.fa ${path}/MMAPPR_083/annotation_files/
cp ./B73_v5.fa.fai ${path}/MMAPPR_083/annotation_files/
4.3 將gtf文件轉化為mammap 可以識別的注釋文件
I. 下載gtfToGenePred
wget -c http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/gtfToGenePred
chmod 777 gtfToGenePred
II. 利用gtfToGenePred 將gtf文件轉化為UCSC refflat 格式
echo "#geneName\tname\tchrom\tstrand\ttxStart\ttxEnd\tcdsStart\tcdsEnd\texonCount\texonStarts\texonEnds" > B73_v5.genes
./gtfToGenePred -genePredExt -geneNameAsName2 B73_v5.gtf -ignoreGroupsWithoutExons /dev/stdout| awk '{print $1, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' >> B73_v5.genes
cp ./B73_v5.genes ${path}/MMAPPR_083/annotation_files/
refFlat 格式如下
table refFlat
"A gene prediction with additional geneName field."
(
string geneName; "Name of gene as it appears in Genome Browser."
string name; "Name of gene"
string chrom; "Chromosome name"
char[1] strand; "+ or - for strand"
uint txStart; "Transcription start position"
uint txEnd; "Transcription end position"
uint cdsStart; "Coding region start"
uint cdsEnd; "Coding region end"
uint exonCount; "Number of exons"
uint[exonCount] exonStarts; "Exon start positions"
uint[exonCount] exonEnds; "Exon end positions"
)
5. 運行MMAPPR軟件
cd ${path}/MMAPPR_083/
vim run_mmappr_test.sh
#修改文件內容如下
#! /bin/bash
python3 ./MMAPPR_083.py \
-c ./WT_sorted.bam \ # 經(jīng)過sort之后的野生型BAM文件
-m ./mut_sorted.bam \ # 經(jīng)過sort之后的突變體BAM文件
-g B73_v5 \ # 參考基因組文件
-u /The_path_to_where_you_installed/USeq_9.3.1/ \ #Useq 軟件的安裝路徑
--cores 8 # 軟件運行所用到的核心數(shù)
#保存修改
nohup sh run_mmappr_test.sh &
#運行程序,等待分析結果
MMAPPR 文件夾的數(shù)據(jù)結構,請勿移動,可將文件替換成自己的文件即可
mmappr/
MMAPPR_083.py # Main python script.
MMAPPR_083.R # R script, invoked by the python script.
run_mmappr_test.sh # Sample bash script to run MMAPPR.
# Needs to be customized to run properly.
annotation_files/
zv9.fa # Zv9 (danRer7) reference genome in fasta format.
zv9.fa.fai # fasta index file for zv9.fa
zv9.genes # Refseq annotation of Zv9 genome in UCSC refflat format.
zv9/ # Folder containing Zv9 split into 1 file per chromosome.
bamfiles/
WTmerged6_STP1N60A.bam # Downsampled (6%) .bam file, wild-type.
MUTmerged6_STP1N60A.bam # Downsampled (6%) .bam file, mutant.
運行結果會在snpmapresults文件夾中顯示,包括snp信息等
6. 對于軟件的其他修改
若得到的區(qū)間不符合預期,或者未能給出預測的區(qū)間,則需要更改MAPPR_083.py文件中的power值
more MAPPR_083.py
顯示結果如下:
# MMAPPR version 0.83. Copyright 2012-2013, University of Utah.
#===============================================================================
# MMAPPR: Mutation Mapping Analysis Pipeline for Pooled RNA-seq
# Jonathon Hill, Bradley Demarest, Brent Bisgrove, Bushra Gorsi, H. Joseph Yost
# Department of Neurobiology and Anatomy, University of Utah
#===============================================================================
import argparse
import os
import os.path
import re
import subprocess
import sys
import threading
import time
tlast = "0"
def log(outputstring, stamp=True):
'Allows outputs to be printed to Stdout and log file'
outputstring = outputstring.rstrip()
global tlast
t = time.strftime("[%d %b %Y %H:%M:%S] ")
if t == tlast or stamp == False:
tstamp = " " * 23
else:
tstamp = t
print(tstamp+outputstring)
logfile.write(tstamp+outputstring+'\n')
tlast = t
sys.stdout.flush()
def sout(soutfile):
o_file.write('chr\tpos\tref\tcov\tA\tC\tG\tT\n')
#compile regs
test = re.compile('[ACGTacgt]')
starts = re.compile('\^.')
ends = re.compile('[\$]')
indels = re.compile('[\+-](\d+)')
for line in soutfile:
line = line.decode(encoding="utf-8")
if len(line) != 0:
line = line.strip()
linearray = line.split() #0=chr 1=pos 2=ref base 3=coverage 4=read bases 5=base quality
if len(linearray) != 6:
log("Skipped invalid line: "+line)
continue
if (args.mask == True and linearray[2].islower()) or line[0] == '#' or linearray[1] == '*' or not test.search(linearray[4]):
continue
linearray[4] = starts.sub('',linearray[4]) #remove start of read chars
linearray[4] = ends.sub('',linearray[4]) #remove end of read character and gap character
for match in indels.findall(linearray[4]): #remove indel information (in future will modify to map indels as well)
pattern = '[\+-]'+match+'.{'+match+'}'
linearray[4] = re.sub(pattern, '', linearray[4])
i = 0
reads = ''
if len(linearray[4]) != len(linearray[5]):
log("Skipped invalid line: "+line)
continue
for qual in linearray[5]: #remove low quality bases
if ord(qual) - 33 >= args.base_qual and linearray[4][i] != "<" and linearray[4][i] != ">":
reads = reads+linearray[4][i]
i += 1
if reads.count('.')+reads.count(',') != len(reads):
trantab = str.maketrans('.,acgt', linearray[2].upper()+linearray[2].upper()+'ACGT') #translate everything to forward sequence bases
reads=reads.translate(trantab)
basecount = {}
basecount['A'] = reads.count('A') #calculate base occurences
basecount['C'] = reads.count('C')
basecount['G'] = reads.count('G')
basecount['T'] = reads.count('T')
cov = str(basecount['A']+basecount['C']+basecount['G']+basecount['T'])
o_file.write('\t'.join([linearray[0],linearray[1],linearray[2],cov,str(basecount['A']),str(basecount['C']),str(basecount['G']),str(basecount['T'])])+'\n')
print(" "*100+'\t'.join(['\r',linearray[0],linearray[1],linearray[2],cov,str(basecount['A']),str(basecount['C']),str(basecount['G']),str(basecount['T'])]), end="\r")
print(" "*100, end='\r')
def serr(serrfile, source=""):
for err in serrfile:
log(source+err.decode(encoding="utf-8"))
def mpileup(inf):
command = ' '.join(['samtools mpileup', '-q', str(args.map_qual), '-f', os.path.abspath(fasta), inf])
log("Mpileup command: "+command)
mp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1)
t1=threading.Thread(target=sout, args=(mp.stdout,))
t2=threading.Thread(target=serr, args=(mp.stderr,))
t1.start()
t2.start()
mp.wait()
t1.join()
t2.join()
exitCode = mp.returncode
if (exitCode == 0):
return "Conversion complete"
else:
raise Exception(inf, exitCode)
def rscript():
scriptpath = os.path.dirname(os.path.realpath(__file__))+"/MMAPPR_083.R"
command = ' '.join(["Rscript --vanilla", scriptpath, outfilec, outfilem, str(args.power), "aicc", str(args.min_depth), str(args.cores), str(args.chrprefix)])
log("Rscript command: "+command)
rp = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
serr(rp.stdout, "R: ")
rp.wait()
exitCode = rp.returncode
if (exitCode == 0):
return "Rscript complete"
else:
raise Exception(exitCode)
def alleler():
scriptpath = os.path.abspath(os.path.join(args.u, "Apps/Alleler"))
command = ' '.join(['java -Xmx5G -jar', scriptpath,
'-a snpmapresults/putativesnps.txt -g', os.path.abspath(fastafolder), '-u', os.path.abspath(annotation), '-c -d'])
log("Alleler command: "+command)
al = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
serr(al.stdout, "Alleler: ")
al.wait()
exitCode = al.returncode
if (exitCode == 0):
return "Alleler complete"
else:
raise Exception(exitCode)
####################
dirmade = "False"
if not os.path.exists("snpmapresults/"):
os.makedirs("snpmapresults/")
dirmade = "True"
parser = argparse.ArgumentParser(description='MMAPPR version 0.82\nFor complete documentation, see website at')
parser.add_argument('-c', required=True, help='bam or sam file for control pool (required)')
parser.add_argument('-m', required=True, help='bam or sam file for mutant pool (required)')
parser.add_argument('-g', required=True, help='genome prefix (required). Corresponding files for gene and genomic annotations should be in the "annotation_files" folder (required)')
parser.add_argument('-u', required=True, help='full path to USeq directory (required)')
parser.add_argument('--log', default="snpmap.log", help='log file name (default="MMAPPR.log"')
parser.add_argument('--map_qual', default=30, type=int, help='minimum map quality score (default = 30)')
parser.add_argument('--base_qual', default=20, type=int, help='minimum base quality score (default = 20)')
parser.add_argument('--min_depth', default=10, type=int, help='minimum read depth at snp (default = 10)')
parser.add_argument('--mask', action='store_true', help='ignores lines with lowercase letters in FASTA reference file. UCSC uses these to mark repetitive regions.')
parser.add_argument('--power', default=4, type=int, help='power to raise euclidean distances to. (default = 4)')
parser.add_argument('--cores', default=1, type=int, help='number of cores to use (default = 1)')
parser.add_argument('--chrprefix', default='chr', help='chromosome prefix (default = "chr".')
args = parser.parse_args()
params = re.match('Namespace\((.*)\)', str(args))
logfile = open("snpmapresults/"+args.log, 'a')
log("Starting MMAPPR version 0.83")
log("Args: "+params.group(1))
annotation = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g+".genes"
fasta = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g+".fa"
fastafolder = os.path.dirname(os.path.realpath(__file__))+"/annotation_files/"+args.g
#-------------------------------------------------------------------------------
# Check dependencies.
log('Python version found: ' + sys.version.split('\n')[0])
# Check that R and samtools are on the PATH and display version.
samversion = re.split("\n", subprocess.getoutput("samtools")) #cannot get status to check if exists because samtools always returns error if not running command
rversion = re.split("\n", subprocess.getoutput("Rscript --version"))
samstat = False
rstat = False
for line in samversion:
if 'Version' in line:
log('Samtools found: ' + line)
samstat = True
for line in rversion:
if 'version' in line:
log('Rscript found: ' + line)
rstat = True
if not samstat:
log('Error: Samtools not found.')
if not rstat:
log('Error: Rscript not found.')
# Check that R package 'parallel' is present.
parallel_check = subprocess.getoutput('''echo "res = require(parallel,
quietly=TRUE); if (res) cat('parallel TRUE\n')" | Rscript -''')
parallelstat = (parallel_check == 'parallel TRUE')
if not parallelstat:
log('''Error: R cannot load the 'parallel' package. Install R version 2.14 or higher.''')
# Check that USeq Alleler jar is callable.
useq_path = os.path.abspath(os.path.join(args.u, 'Apps/Alleler'))
useq_check = subprocess.getstatusoutput('java -Xmx4g -jar ' + useq_path)
useqstat = not useq_check[0]
if useqstat:
log('USeq found: ' + useq_path)
else:
log('Error: USeq not found.' + useq_path)
# Check that user specified files are readable.
ufiles = [args.c, args.m, annotation, fasta, fastafolder]
file_access = [os.access(os.path.abspath(ufile), os.R_OK) for ufile in ufiles]
for ufile, filestat in zip(ufiles, file_access):
check_file = log(os.path.abspath(ufile) + (' -- Found!\n' if filestat else
' -- Error: File not found.\n'))
# Exit MMAPPR if any files or dependencies are not found.
check_all = [samstat, rstat, parallelstat] + file_access
if not all(check_all):
log('MMAPPR is quitting...')
sys.exit()
#-------------------------------------------------------------------------------
if dirmade == "False":
log("Snpmap results directory found. Checking if snp files present.")
basenamec = re.match('(.*)/(.*?)\.(.*?)', args.c)
outfilec = "snpmapresults/"+basenamec.group(2)+'mapq'+str(args.map_qual)+'baq'+str(args.base_qual)+'.snps'
basenamem = re.match('(.*)/(.*?)\.(.*?)', args.m)
outfilem = "snpmapresults/"+basenamem.group(2)+'mapq'+str(args.map_qual)+'baq'+str(args.base_qual)+'.snps'
if os.path.isfile(outfilec):
log("Snp file for control found. Skipping conversion step.")
else:
o_file = open(outfilec, 'w')
log("Converting "+args.c+" to snp file")
log(mpileup(os.path.abspath(args.c)))
o_file.close()
if os.path.isfile(outfilem):
log("Snp file for mutant found. Skipping conversion step.")
else:
o_file = open(outfilem, 'w')
log("Converting "+args.m+" to snp file")
log(mpileup(os.path.abspath(args.m)))
o_file.close()
log("Running Rscript to find candidates")
log(rscript())
log("Running Alleler to find functional effect of candidates")
log(alleler())
第157行顯示為
parser.add_argument('--power', default=4, type=int, help='power to raise euclidean distances to. (default = 4)')
我們在run_mmappr_test. sh 文件,最后添加一行 -- power = 10 或者其他合適的數(shù)值即可
同時第147-158 為可以修改的參數(shù),可以根據(jù)自己的需求進行修改
該數(shù)值可以嘗試多次修改,每次修改后重新運行程序即可,程序會自動跳過Call snp的步驟,直接根據(jù)SNP信息預測區(qū)間,所用時間會比第一次操作時較短
部分內容來源網(wǎng)絡,如侵權,聯(lián)系刪除。
個人學習筆記,如有錯誤,不吝賜教