全長(zhǎng)轉(zhuǎn)錄組分析

sample1

  • sample1基本測(cè)序信息統(tǒng)計(jì)
1.統(tǒng)計(jì)reads數(shù)
samtools view m64082_210415_203420.subreads.1--1.bam| wc -l
16394742                 #sample1 reads number
2.統(tǒng)計(jì)bases數(shù)
samtools view m64082_210415_203420.subreads.1--1.bam| cut -f10 |wc -c
51688834095
51688834095-16394742=51672439353  #sample1 總bases數(shù)
3.統(tǒng)計(jì)reads長(zhǎng)度分布
samtools view m64082_210415_203420.subreads.1--1.bam | cut -f10 |  awk '{print length($1);}'  | sort -n -r > sample1.subreads.length
4.計(jì)算N50
samtools fasta m64082_210415_203420.subreads.1--1.bam >m64082_210415_203420.subreads.1--1.fasta
perl ~/soft/N50_N90.pl m64082_210415_203420.subreads.1--1.fasta >sample1.subreads.N50 &
N50: 3465
  • 畫頻數(shù)分布直方圖
setwd("C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/subreads")
a<-read.table(file = "sample1.subreads.length")
a$V1<-as.integer(a$V1) 
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_histogram(binwidth=200,fill="#6e6dcd", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Read Length",limits=c(0, 20000))+
scale_y_continuous(name="Reads",limits=c(0, 1500000))
ggsave("sample1.subreads.length.tiff",height = 4,width = 8)
  • 獲取一致性序列CCS
nohup  /public/jychu/soft/smrtlink/smrtcmds/bin/ccs --min-passes 2 --min-length 50 --max-length 15000 --min-rq 0.9  --num-threads 20  m64082_210415_203420.subreads.1--1.bam  sample0.ccs.bam  &
  • 統(tǒng)計(jì)CCS信息
1.統(tǒng)計(jì)reads數(shù)
samtools view sample1.ccs.bam | wc -l
628188    #reads number
2.統(tǒng)計(jì)bases數(shù)
samtools view sample1.ccs.bam | cut -f10 |wc -c
2205667668
2205667668-628188=2205039480     #all bases
3.統(tǒng)計(jì)序列長(zhǎng)度、FP、質(zhì)量值
samtools view sample1.ccs.bam | cut -f10 |  awk '{print length($1);}' | sort -n -r > sample1.ccs.length
samtools view sample1.ccs.bam |cut -f16 | tr ":" "\t" | cut -f3 > sample1.Fullpasses
samtools view sample1.ccs.bam |cut -f17 | tr ":" "\t" | cut -f3 > sample1.Quality
cat sample1.Quality| awk '{sum+=$1} END {print "Avg =", sum/NR}'
Avg = 0.995584
cat sample1.Fullpasses| awk '{sum+=$1} END {print "Avg =", sum/NR}'
Avg = 22.9421
  • 畫頻數(shù)分布直方圖
setwd("C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/ccs")
a<-read.table(file = "sample1.ccs.length")
a$V1<-as.integer(a$V1) 
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_histogram(binwidth=200,fill="#6498C8", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Read Length",limits=c(0, 15000))+
scale_y_continuous(name="Reads",limits=c(0, 50000))
ggsave("sample1.ccs.length.tiff",height = 4,width = 8)
rm(list=ls())
setwd("C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/ccs")
a<-read.table(file = "sample1.Quality")
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_histogram(binwidth=0.0008,fill="#87C966", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Quality",limits=c(0.9,1))+
scale_y_continuous(name="Reads",limits=c(0,30000))
ggsave("sample1.ccs.quality.tiff",height = 4,width = 8)
rm(list=ls())
setwd("C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/ccs")
a<-read.table(file = "sample1.Fullpasses")
a$V1<-as.integer(a$V1) 
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_histogram(binwidth=2,fill="#F4AB58", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Number of Passes",limits=c(0, 200))+
scale_y_continuous(name="Reads",limits=c(0,60000))
ggsave("sample1.ccs.Fullpasses.tiff",height = 4,width = 8)
  • CCS分類
1.統(tǒng)計(jì)含有5p引物序列數(shù)
vi 5p.fasta
>primer_5p
GCAATGAAGTCGCAGGGTTGGG
 /public/jychu/soft/smrtlink/smrtcmds/bin/lima  --peek-guess -j 20 sample1.ccs.bam 5p.fasta sample1.5p.bam
samtools view sample1.5p.bam| wc -l
563560       #Number of five-five prime reads
rm -f sample1.5p.*
2.統(tǒng)計(jì)含有3p引物序列數(shù)
vi 3p.fasta
>primer_3p
AAGCAGTGGTATCAACGCAGAGTAC
/public/jychu/soft/smrtlink/smrtcmds/bin/lima  --peek-guess -j 20 sample1.ccs.bam 3p.fasta sample1.3p.bam
samtools view sample1.3p.bam| wc -l
615510          #Number of three-three prime reads
rm -f sample1.3p.*
3.統(tǒng)計(jì)同時(shí)含有3p和5p的序列數(shù)
vi IsoSeqPrimers.fasta
>primer_5p
GCAATGAAGTCGCAGGGTTGGG
>primer_3p
AAGCAGTGGTATCAACGCAGAGTAC
/public/jychu/soft/smrtlink/smrtcmds/bin/lima --isoseq --dump-clips --peek-guess -j 20 sample1.ccs.bam IsoSeqPrimers.fasta sample1.fl.bam &
samtools view sample1.fl.primer_5p--primer_3p.bam |  wc -l
540612                     #Number of five-three prime reads
mv  sample1.fl.* fl/
  • isoseq3 refine(去除ployA和嵌合序列)
1.去除ployA和嵌合序列
/public/jychu/soft/smrtlink/smrtcmds/bin/isoseq3 refine --require-polya --min-polya-length 20 -j 4 sample1.fl.primer_5p--primer_3p.bam IsoSeqPrimers.fasta sample1.flnc.bam
mkdir flnc
mv sample1.flnc.* flnc/
2.統(tǒng)計(jì)全長(zhǎng)非嵌合序列數(shù)
samtools view sample1.flnc.bam |wc -l
539221          #Number of full-length non-chimeric reads
3.統(tǒng)計(jì)序列長(zhǎng)度
samtools view sample1.flnc.bam | cut -f10 |  awk '{print length($1);}' | sort -n -r > sample1.flnc.length
cat sample1.flnc.length| awk '{sum+=$1} END {print "Avg =", sum/NR}'
Avg = 3413.56       #Mean length of flnc
4.統(tǒng)計(jì)reads長(zhǎng)度密度分布
把sample1.flnc.length下載到文件夾C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/flnc  
setwd("C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/flnc")
a<-read.table(file = "sample1.flnc.length")
a$V1<-as.integer(a$V1) 
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_density(fill="#fdd5d3", color="#e06666", alpha=0.9,size=0.4,linetype="solid")+       #size指外部線條大小
theme_bw()+ scale_x_continuous(name="Sequence length(bp)",breaks =seq(0,12000,2000),limits=c(0, 12000))+
theme(plot.title = element_text(size = 15,face = "bold", vjust = 0.5, hjust = 0.5),panel.background = element_rect(fill = "transparent",colour = NA), panel.grid.minor = element_blank(), panel.grid.major = element_blank(),plot.background = element_rect(fill = "transparent",colour = NA))
ggsave("sample1.flnc.length.tiff",height = 4,width = 6)
  • FLNC聚類
1.序列聚類
mkdir clustered
/public/jychu/soft/smrtlink/smrtcmds/bin/isoseq3 cluster sample1.flnc.bamclustered/sample1.clustered.bam -j 20 --verbose --use-qvs
cd clustered
2.統(tǒng)計(jì)一致性轉(zhuǎn)錄本數(shù)目
samtools view sample1.clustered.bam | wc -l
46314                #Number of consensus isoforms(一致轉(zhuǎn)錄本)
samtools view sample1.clustered.hq.bam | wc -l
45663               #Number of high-quality isoforms
samtools view sample1.clustered.lq.bam | wc -l
651                   #Number of polished low-quality isoforms
3.統(tǒng)計(jì)一致性轉(zhuǎn)錄本序列長(zhǎng)度
samtools view sample1.clustered.bam | cut -f10 |  awk '{print length($1);}' | sort -n -r > sample1.clustered.length
4.統(tǒng)計(jì)一致性轉(zhuǎn)錄本reads長(zhǎng)度密度分布   
下載sample1.clustered.length到目錄C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/clustered
統(tǒng)計(jì)一致性轉(zhuǎn)錄本reads長(zhǎng)度密度分布   
setwd("C:/Users/TE/Desktop/全長(zhǎng)轉(zhuǎn)錄組/紹梅師姐/sample1/clustered")
a<-read.table(file = "sample1.clustered.length")
a$V1<-as.integer(a$V1) 
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_density(fill="#fdd5d3", color="#e06666", alpha=0.9,size=0.4,linetype="solid")+       #size指外部線條大小
theme_bw()+ scale_x_continuous(name="Sequence length(bp)",breaks =seq(0,12000,2000),limits=c(0, 12000))+
theme(plot.title = element_text(size = 25,face = "bold", vjust = 0.5, hjust = 0.5),panel.background = element_rect(fill = "transparent",colour = NA), panel.grid.minor = element_blank(), panel.grid.major = element_blank(),plot.background = element_rect(fill = "transparent",colour = NA))
ggsave("sample1.clustered.length.tiff",height = 4,width = 6)
  • 轉(zhuǎn)錄本去冗余(CD-hit使用 )
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered
mkdir uncompare
cp sample1.clustered.hq.fasta.gz uncompare/
cd uncompare
gzip -d sample1.clustered.hq.fasta.gz
/public/jychu/soft/cd-hit-v4.8.1-2019-0228/cd-hit  -i sample1.clustered.hq.fasta -o sample1.fasta -c 0.99 -T 30 -d 40   -M 16000 -U 10 -p 1 -G 1 -s 0.99
45013  finished      45013  clusters非冗余的高質(zhì)量轉(zhuǎn)錄本
  • CDS預(yù)測(cè)(TransDecoder 軟件)
#預(yù)測(cè)轉(zhuǎn)錄本中長(zhǎng)的開放閱讀框, 默認(rèn)是100個(gè)氨基酸,可以用-m修改
cd  /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir CDS
cp sample1.fasta CDS/
cd CDS
~/soft/TransDecoder-v5.5.0/TransDecoder.LongOrfs -t sample1.fasta
#使用DIAMOND對(duì)上一步輸出的longest_orfs.pep在蛋白數(shù)據(jù)庫(kù)中進(jìn)行搜索,尋找同源證據(jù)支持
1. 使用uniprot蛋白數(shù)據(jù)庫(kù)-BLASTP比對(duì)
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/CDS/sample1.fasta.transdecoder_dir
~/soft/diamond/diamond blastp -d ~/database/uniprot_sprot/uniprot_sprot -q  longest_orfs.pep --evalue 1e-5 --sensitive -p 40 --max-target-seqs 1 > blastp.outfmt6
1.1 預(yù)測(cè)可能的編碼區(qū)
~/soft/TransDecoder-v5.5.0/TransDecoder.Predict  -t sample1.fasta --retain_blastp_hits sample0.fasta.transdecoder_dir/blastp.outfmt6 
cat sample1.fasta.transdecoder.bed | cut -f1 | sort | uniq |wc -l
37632                                 #有CDS結(jié)構(gòu)的序列數(shù)
cat sample0.fasta.transdecoder.bed | wc -l
49907                                  #檢測(cè)出CDS區(qū)
修改文件名
mv sample1.fasta.transdecoder.bed sample1.uniport.transdecoder.bed
mv sample1.fasta.transdecoder.cds sample1.uniport.transdecoder.cds
mv sample1.fasta.transdecoder.gff3 sample1.uniport.transdecoder.gff3
mv sample1.fasta.transdecoder.pep sample1.uniport.transdecoder.pep
把50個(gè)堿基一行的序列改為該序列的所有堿基為一行
seqkit  seq  sample1.uniport.transdecoder.cds  -w  0  >  sample1.uniport.cds.fa
下載 sample1.uniport.cds.fa sample1.uniport.transdecoder.gff3 到桌面目錄
==============================================================
1.2 使用自己提取出來(lái)的nr動(dòng)物蛋白數(shù)據(jù)庫(kù)
1.2.1 blastp比對(duì)
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/CDS/sample1.fasta.transdecoder_dir
~/soft/diamond/diamond blastp -p 40 -d ~/database/nr/nr.animal -q  longest_orfs.pep --evalue 1e-5  > nr.animal.blastp.outfmt6
sort -k1,1 -k12,12nr nr.animal.blastp.outfmt6 |awk  '!a[$1]++' >nr.animal.blastp.outfmt6.txt
把 nr ID轉(zhuǎn)換成genesymbol  網(wǎng)站(https://biodbnet-abcc.ncifcrf.gov/db/db2db.php)
cat nr.animal.blastp.outfmt6.txt |cut -f2 |sort|uniq >refid.txt
下載到桌面在excel中打開,復(fù)制第一列到ID轉(zhuǎn)換網(wǎng)站(用GeneBank Protein Accession轉(zhuǎn)換)
保存為genebank.txt  上傳至服務(wù)器
在nr-animal.fa中查找refid.txt對(duì)應(yīng)的序列
vi refid.sh
cat refid.txt | while read id;
do
         arr=(${id})
         geneid=${arr[0]}
grep  -w  "$geneid" ~/database/nr/nr.animal.fa >> sample1.nr.protein
done
chmod a+x refid.sh
nohup ./refid.sh &                                            #8個(gè)小時(shí)
cat sample1.nr.protein| sort |uniq |tr ">" "\t"|cut -f2 >sample1_nr.protein                      #提取第一個(gè)注釋到的物種信息
cat sample1_nr.protein|tr " " "\t"|awk '{$1=null;print $0}'|tr "\t" " "|sed  's/^[\t ]*//g'>nr.detil     #把id去掉,并去掉首行的空格
sed -i "1d" genebank.txt        #去掉首行
cat genebank.txt|cut -f1,2 >col2   #取前兩列
paste col2  nr.detil | tr "\t" ":" >sample1.nr.zhushi    #把文本轉(zhuǎn)換成AAB96843:VCL:vinculin [Mus musculus] 的格式
rm -f  col2 nr.detil  nr.animal.blastp.outfmt6  #去掉沒用的文件
cat nr.animal.blastp.outfmt6.txt | cut -f1,2 |tr "\t" " "|tr "." "\t"|cut -f1,2|tr "\t" "."|tr " " "\t"| sort -k2,2 >nr.animal.blastp.outfmt6.txt.2col   #提取nr比對(duì)結(jié)果文件的前兩列,并把.去掉,然后按照第二列的字符排序
cut -f2 nr.animal.blastp.outfmt6.txt.2col >col2      #提取按照字符排序的序列ID
從sample1.nr.zhushi中遍歷col2文件的每一行字符
vi serach.sh
cat col2| while read id;
do
         arr=(${id})
         geneid=${arr[0]}
grep  -w  "$geneid" sample1.nr.zhushi >> nr.animal.blastp.outfmt6.txt.2col.col2
done
chmod a+x serach.sh
./serach.sh
cat nr.animal.blastp.outfmt6.txt.2col|cut -f1 >nr.animal.blastp.outfmt6.txt.2col.col1      #取第一列轉(zhuǎn)錄本ID
paste nr.animal.blastp.outfmt6.txt.2col.col1 nr.animal.blastp.outfmt6.txt.2col.col2 >  sample1.nr.final.zs     #合并轉(zhuǎn)錄本ID和其對(duì)應(yīng)的nr數(shù)據(jù)庫(kù)的注釋信息
1.2.3 預(yù)測(cè)可能的編碼區(qū)
cd ..
~/soft/TransDecoder-v5.5.0/TransDecoder.Predict  -t sample1.fasta --retain_blastp_hits sample1.fasta.transdecoder_dir/sample1.nr.final.zs 
cat sample1.fasta.transdecoder.bed | cut -f1 | sort | uniq |wc -l
37684                                  #有CDS結(jié)構(gòu)的序列數(shù)
cat sample1.fasta.transdecoder.bed | wc -l
50206                                  #檢測(cè)出50206個(gè)CDS區(qū)
修改文件名
mv sample1.fasta.transdecoder.bed sample1.nr.transdecoder.bed
mv sample1.fasta.transdecoder.cds sample1.nr.transdecoder.cds
mv sample1.fasta.transdecoder.gff3 sample1.nr.transdecoder.gff3
mv sample1.fasta.transdecoder.pep sample1.nr.transdecoder.pep
把50個(gè)堿基一行的序列改為該序列的所有堿基為一行
seqkit  seq  sample1.nr.transdecoder.cds  -w  0  >  sample1.nr.cds.fa
下載 sample1.nr.cds.fa sample1.nr.transdecoder.gff3 到桌面目錄
  • SSR-簡(jiǎn)單重復(fù)序列預(yù)測(cè)(MISA)
運(yùn)行前將misa.ini與fasta文件放在一起,輸入的序列存在fasta文件里面,然后運(yùn)行下面的命令
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir SSR
cp sample1.fasta SSR/
cd SSR
cp ~/soft/misa/misa.ini ./
perl ~/soft/misa/misa.pl sample1.fasta
less -S sample1.fasta.statistics
Total number of identified SSRs:                 45532
Number of SSRs present in compound formation:    4520   #復(fù)合衛(wèi)星數(shù)目
wc -l sample1.fasta.misa
41013 sample1.fasta.misa                #該文件包含所有的SSR序列的信息,但是會(huì)把符合復(fù)合微衛(wèi)星序列的序列信息放在一行,所以文件的行數(shù)比總微衛(wèi)星數(shù)目少
41013=45532-4520+1                #+1是因?yàn)榈谝恍惺切惺讟?biāo)題
下載sample1.fasta.misa  sample1.fasta.statistics到桌面
計(jì)算SSR的密度分布(count/MB)
首先統(tǒng)計(jì)sample1的序列總長(zhǎng)度
grep -v ">" sample1.fasta|wc -c
153248411
grep -v ">" sample1.fasta|wc -l
45013
總長(zhǎng)度為153248411-45013=153203398
總Mb為153203398/1000000=153.2
---------------------------------------------------------------------------------------------------------
畫密度分布圖,新建文件ssrdensity.txt
type    denity
C    29.50391645
P1    244.3146214
P2    21.15535248
P3    27.46083551
P4    2.950391645
P5    1.11618799
P6    0.208877285
setwd("C:/Users/TE/Desktop/紹梅師姐/sample1/uncompare/SSR")
data<-read.table(file="ssrdensity.txt",header=T,sep="\t")
library(ggplot2)
data$type=factor(data$type,levels=data$type)
CPCOLS <- c("#002fa7","#00fa9a","#008080","#ba55d3","#f0e68c","#f08080","#90ee90")
p<-ggplot(data=data,aes(x=denity,y=type,fill=type))+geom_bar(stat="identity",width=0.8)+coord_flip()+ scale_fill_manual(values = CPCOLS) + theme_bw() + theme(axis.text=element_text(face = "bold", color="gray50")) +labs(y="Period number",x="SSR count/bases(Mb)",title="SSR Density",size=20)+theme(panel.grid.major=element_blank(),panel.grid.minor= element_blank(),legend.position = "none")+theme(axis.text.x = element_text(size=13,face="bold", hjust = 0.5, vjust = 0.5))+theme(plot.title = element_text(hjust = 0.5,size=18,face="bold"))+ theme(axis.title.x = element_text(size = 15, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+theme(axis.title.y = element_text(size = 15, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+theme(axis.text.y = element_text(size=13,face="bold", hjust = 0.5, vjust = 0.5))
ggsave("SSRDensity.tiff", p, width = 14, height=10)
  • 轉(zhuǎn)錄本注釋
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir zhushi
cd zhushi
mkdir nr
mkdir uniport
mkdir kog
#用diamond BLASTX進(jìn)行注釋
1. nr注釋
cd nr
nohup  ~/soft/diamond/diamond blastx --db ~/database/nr/nr.animal -q  ../../sample1.fasta --sensitive -p 40 --evalue 1e-5 -o  sample1_nr_matches_fmt6.txt &
cat sample1_nr_matches_fmt6.txt| cut -f1 | sort |uniq|wc -l
37149   #比對(duì)上的轉(zhuǎn)錄本數(shù)目
sort -k1,1 -k12,12nr sample1_nr_matches_fmt6.txt |awk  '!a[$1]++' >sample1.nr.fmt6   #提取每個(gè)轉(zhuǎn)錄本第12列的最大值的行
rm -f sample1_nr_matches_fmt6.txt     #刪除無(wú)用的文件
把 nr ID轉(zhuǎn)換成genesymbol  網(wǎng)站(https://biodbnet-abcc.ncifcrf.gov/db/db2db.php)
cat sample1.nr.fmt6 |cut -f2 |sort|uniq >refid.txt
下載到桌面在excel中打開,復(fù)制第一列到ID轉(zhuǎn)換網(wǎng)站(分別用GeneBank Protein Accession\RefSeq Protein Accession轉(zhuǎn)換)
保存為genebank.txt  上傳至服務(wù)器
在nr-animal.fa中查找refid.txt對(duì)應(yīng)的序列
vi refid.sh
cat refid.txt | while read id;
do
         arr=(${id})
         geneid=${arr[0]}
grep  -w  "$geneid" ~/database/nr/nr.animal.fa >> sample1.nr.protein
done
chmod a+x refid.sh
nohup ./refid.sh &                                                                                 #6小時(shí)
cat sample1.nr.protein| sort |uniq |tr ">" "\t"|cut -f2 >sample1_nr.protein                      #提取第一個(gè)注釋到的物種信息
cat sample1_nr.protein|tr " " "\t"|awk '{$1=null;print $0}'|tr "\t" " "|sed  's/^[\t ]*//g'>nr.detil     #把id去掉,并去掉首行的空格
sed -i "1d" genebank.txt        #去掉首行
cat genebank.txt|cut -f1,2 >col2   #取前兩列
paste col2  nr.detil | tr "\t" ":" >sample1.nr.zhushi    #把文本轉(zhuǎn)換成AAB96843:VCL:vinculin [Mus musculus] 的格式
rm -f  col2 nr.detil   #去掉沒用的文件
cat sample1.nr.fmt6 | cut -f1,2 |tr "\t" " "|tr "." "\t"|cut -f1,2|tr "\t" "."|tr " " "\t"| sort -k2,2 >nr.animal.blastp.outfmt6.txt.2col   #提取nr比對(duì)結(jié)果文件的前兩列,并把.去掉,然后按照第二列的字符排序
cut -f2 nr.animal.blastp.outfmt6.txt.2col |tr "." "\t" |cut -f1>col2      #提取按照字符排序的序列ID
從sample1.nr.zhushi中遍歷col2文件的每一行字符
vi serach.sh
cat col2| while read id;
do
         arr=(${id})
         geneid=${arr[0]}
grep  -w  "$geneid" sample1.nr.zhushi >> nr.animal.blastp.outfmt6.txt.2col.col2
done
chmod a+x serach.sh
./serach.sh
cat nr.animal.blastp.outfmt6.txt.2col|cut -f1 >nr.animal.blastp.outfmt6.txt.2col.col1      #取第一列轉(zhuǎn)錄本ID
paste nr.animal.blastp.outfmt6.txt.2col.col1 nr.animal.blastp.outfmt6.txt.2col.col2 >  sample1.nr.final.zs     #合并轉(zhuǎn)錄本ID和其對(duì)應(yīng)的nr數(shù)據(jù)庫(kù)的注釋信息,并下載到桌面
cat sample1.nr.final.zs|tr "[" "\t"|sed "s/]//g" |cut -f3|sort|uniq -c                  #查看比對(duì)到的物種的分類,復(fù)制到excel表里按數(shù)值降序
2.Swiss-Prot
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhuzhi/uniport
~/soft/diamond/diamond blastx -d ~/database/uniprot_sprot/uniprot_sprot  -q  ../../sample1.fasta --sensitive -p 40 --evalue 1e-5 -o  sample1_uniprot_matches_fmt6.txt
sort -k1,1 -k12,12nr sample1_uniprot_matches_fmt6.txt |awk  '!a[$1]++' >sample1.uniport.fmt6
rm -f  sample1_uniprot_matches_fmt6.txt
wc -l sample1.uniport.fmt6
35577     #比對(duì)上的轉(zhuǎn)錄本數(shù)目
3.KEGG   在線比對(duì)參考http://www.itdecent.cn/p/48716fa7321b
grep "K" query.ko.txt|cut -f1 >kegg.zhushi                   18953個(gè)注釋到的轉(zhuǎn)錄本
4. KOG數(shù)據(jù)庫(kù)
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare/zhuzhi/kog
~/soft/diamond/diamond blastx -d ~/database/KOG/kyva  -q  ../../sample1.fasta --sensitive  --evalue 1e-5 -o  sample1_kog_matches_fmt6.txt
35203 queries aligned.    #比對(duì)上的轉(zhuǎn)錄本數(shù)目
sort -k1,1 -k12,12nr sample1_kog_matches_fmt6.txt |awk  '!a[$1]++' >sample1_kog.fmt6       #下載到桌面
rm -f  sample1_kog_matches_fmt6.txt
cat sample1_kog.fmt6|cut -f1,2 >kog.id
從kog.txt中遍歷kog.id文件的第二列的每一行字符
vi serachkog.sh
cat kog.id| while read id;
do
         arr=(${id})
         geneid=${arr[1]}
grep  -w  "$geneid" /public/jychu/database/KOG/kog.txt >> kog.id.group
done
chmod a+x serachkog.sh
./serachkog.sh
wc -l kog.id.group
28977     #一共有28977個(gè)kogID可以注釋到功能層級(jí)
cat kog.id.group|tr "]" "\t"|cut -f1>sample1.kog.group    #提取功能描述的編號(hào)
cat sample1.kog.group|tr "\n" ","|sed "s/,//g" >row1    #把所有字符放在一行
wc -c row1      32764-1=32763個(gè)字符
把每一個(gè)字符分成一行
for((i=1;i<=32763;i++));
do
cut -b $i row1 >>sample1.kog.group.1row;
done
cat sample1.kog.group.1row|sort|uniq -c    #統(tǒng)計(jì)每個(gè)字符(Group 分類)出現(xiàn)的次數(shù)
  1577 A
    711 B
    577 C
   1079 D
    559 E
    326 F
    706 G
    162 H
    863 I
   1111 J
   2079 K
    694 L
    175 M
    114 N
   2685 O
    493 P
    163 Q
   4991 R
   2172 S
   5311 T
   2431 U
    281 V
   1005 W
     27 X
    424 Y
   2047 Z
--------------------------------------------------
繪圖
setwd("C:/Users/TE/Desktop/紹梅師姐/sample1/uncompare/數(shù)據(jù)庫(kù)注釋/kog")
library(ggplot2)
data<- read.table("kog.picture.txt",header=T, sep="\t")
CPCOLS <- c("#8DA1CB", "#FD8D62", "#66C3A5","#db7093")
data$Term=factor(data$Term,levels=data$Term) 
p <- ggplot(data=data, aes(y=Term, x=number, fill=group)) +
geom_bar(stat="identity", width=0.8) + coord_flip() +
scale_fill_manual(values = CPCOLS) + theme_bw() +
theme(axis.text=element_text(face = "bold", color="gray50"))+labs(title = "KOG Function Classification of Consensus Sequence",y = "Function Class",x = "Frequency")+theme(panel.grid.major =element_blank(), panel.grid.minor = element_blank())+theme(legend.position = "right")+theme(plot.title = element_text(hjust = 0.5,size=17,face="bold"))+theme(axis.title.x = element_text(size = 14, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+theme(axis.title.y = element_text(size = 14, color = "black", face = "bold", vjust = 0.5, hjust = 0.5))+
guides(fill = guide_legend(title = NULL))+theme(axis.text.x = element_text(angle = 288,size=12,face="bold", hjust = 0.02, vjust = 0.02))
ggsave("KOGpicture.tiff", p, width = 12, height=13)
========================================================
進(jìn)入桌面:
cd nr
cut -f1 sample1.nr.final.zs >nr.zhushi
cut -f1 ../kog/sample1_kog.fmt6 >../kog/kog.zhushi
cut -f1 ../uniport/sample1.uniport.fmt6 > ../uniport/uniport.zhushi
cat nr.zhushi ../uniport/uniport.zhushi ../kog/kog.zhushi ../kegg/kegg.zhushi|sort|uniq|wc -l
37215        #至少有一個(gè)數(shù)據(jù)庫(kù)注釋成功的轉(zhuǎn)錄本數(shù)目
  • lncRNA預(yù)測(cè)
1.CAPT預(yù)測(cè)
cd /public/jychu/sandai-goose/sample1/ccs/fl/flnc/clustered/uncompare
mkdir lncRNA
cd lncRNA
cpat.py -x /public/jychu/soft/CPAT/goose/goose_Hexamer.tsv -d /public/jychu/soft/CPAT/goose/goose.logit.RData -g ../sample1.fasta -o sample1.goose.CAPT.out
2. 統(tǒng)計(jì)nr數(shù)據(jù)庫(kù)和uniport數(shù)據(jù)庫(kù)沒有注釋到的轉(zhuǎn)錄本id
grep ">" ../sample1.fasta|sed "s/>//g"|tr " " "\t"|cut -f1 >transcript.all    #提取所有的轉(zhuǎn)錄本id名稱
cut -f1 ../zhushi/uniport/sample1.uniport.fmt6 >uniport.trans    #提取uniport注釋到的轉(zhuǎn)錄本id
cat transcript.all uniport.trans|sort|uniq -u > uniport.untrans   #提取uniport未比對(duì)到的轉(zhuǎn)錄本id     9436個(gè)
rm -f uniport.trans           #刪除無(wú)用文件
cut -f1 ../zhushi/nr/sample1.nr.fmt6 >nr.trans    #提取nr注釋到的轉(zhuǎn)錄本id
cat nr.trans transcript.all |sort|uniq -u > nr.untrans   #提取nr未比對(duì)到的轉(zhuǎn)錄本id    7864個(gè)
rm -f nr.trans
cd lncRNA
awk  '$2>=200' sample1.goose.CAPT.out.ORF_prob.tsv |cut -f1|tr "_" "\t"|cut -f1|sort|uniq > capt.alltrans    #提取ORF長(zhǎng)度大于200的所有轉(zhuǎn)錄本id
awk '$10>=0.1 && $2>=200' sample1.goose.CAPT.out.ORF_prob.tsv|cut -f1|tr "_" "\t"|cut -f1|sort|uniq > goose.unlncRNA     #提取編碼潛力大于0.1的轉(zhuǎn)錄本id
cat goose.unlncRNA capt.alltrans|sort|uniq -u > capt.untrans      #提取編碼潛力小于0.1,ORF長(zhǎng)度大于200的轉(zhuǎn)錄本id    11281個(gè)
rm -f  capt.alltrans  goose.unlncRNA
下載 uniport.untrans  nr.untrans  capt.untrans 到桌面
3.用CPC2軟件預(yù)測(cè)編碼潛能
conda activate python2  #cpc2腳本只能在python2環(huán)境下使用
python /public/jychu/soft/CPC2-beta/bin/CPC2.py -i ../sample1.fasta -o sample1.cpc2      #運(yùn)行命令
sample1.cpc2.txt             #輸出文件
grep "noncoding" sample1.cpc2.txt |cut -f1 >sample1.cpc2.lnc     #把預(yù)測(cè)為沒有編碼能力的轉(zhuǎn)錄本ID提取出來(lái),下載到桌面
wc -l sample1.cpc2.lnc
12831 sample1.cpc2.lnc     #一共預(yù)測(cè)到12831個(gè)沒有編碼能力的轉(zhuǎn)錄本
--------------------------------------------------------------------------------------------
在https://bioinfogp.cnb.csic.es/tools/venny/index.html 網(wǎng)站繪制韋恩圖
把交集的序列保存為sample1.lnc.all,在服務(wù)器中新建sample1.lnc.all文件,并把序列粘貼上去(如果直接把文件上傳上去,shell腳本識(shí)別不了該文件),下載到桌面上
conda deactivate
---------------------------------------------------------------------------------------------
4.構(gòu)建鵝的lncRNA數(shù)據(jù)庫(kù)索引,用blast進(jìn)行比對(duì)
cd /public/jychu/reference/index/hisat/goose
makeblastdb -in lncRNA.name.fa -parse_seqids  -dbtype nucl -out lncrna.db                  #建立索引
提取出所有的lncRNA序列
vi lnc.all.sh
cat sample1.lnc.all| while read id;
do
         arr=(${id})
         geneid=${arr[0]}
grep  -w -A1 "$geneid" ../sample1.fasta >> sample1.lnc.all.fa      #下載到桌面上
done
chmod a+x lnc.all.sh
./lnc.all.sh &
blastn -query sample1.lnc.all.fa -out sample1.goose.blast -db /public/jychu/reference/index/hisat/goose/lncrna.db -outfmt 6 -evalue 1e-5 -num_threads 40   #lncran序列與鵝的已知lncrna數(shù)據(jù)庫(kù)進(jìn)行比對(duì)
sort -k1,1 -k12,12nr sample1.goose.blast |awk  '!a[$1]++' >sample1.goose.lnc    #提取每個(gè)轉(zhuǎn)錄本第12列的最大值的行,下載到桌面上
rm -f sample1.goose.blast    #刪除無(wú)用的文件
cut -f1 sample1.goose.lnc|sort|uniq|wc -l
230   #一共有230個(gè)轉(zhuǎn)錄本比對(duì)到鵝的lncRNA上
cut -f2 sample1.goose.lnc|sort|uniq|wc -l
124  #一共比對(duì)上124個(gè)lncRNA
seqkit  seq  sample1.goose.CAPT.out.ORF_seqs.fa  -w  0  >  sample1.goose.CAPT.ORF.fa   #把序列轉(zhuǎn)換成一行
sed -i "s/_/ /g" sample1.goose.CAPT.ORF.fa   #把行首標(biāo)簽替換字符,以便下面腳本中序列名稱的匹配
vi lnc.all.orf.sh                           #提取lncRNA預(yù)測(cè)的ORF序列
cat sample1.lnc.all| while read id;
do
         arr=(${id})
         geneid=${arr[0]}
grep -i -w -A1 "$geneid" sample1.goose.CAPT.ORF.fa >> sample1.lncrna.ORF.fa         #下載到桌面上
done
chmod a+x lnc.all.orf.sh
./lnc.all.orf.sh
grep ">" sample1.lnc.all.fa|tr "=" "\t"|cut -f3 >sample1.lnc.all.length  #獲取序列長(zhǎng)度信息,并下載到桌面上
畫序列長(zhǎng)度分布圖
setwd("C:/Users/TE/Desktop/紹梅師姐/sample1/uncompare/lncRNA")
a<-read.table(file = "sample1.lnc.all.length")
a$V1<-as.integer(a$V1) 
library(ggplot2)
ggplot(data=a,aes(x=V1)) + 
geom_histogram(binwidth=200,fill="#6498C8", color="#e9ecef", alpha=0.9)+
theme_bw()+ scale_x_continuous(name="Read Length",limits=c(0, 15000))+
scale_y_continuous(name="count")
ggsave("sample1.lnc.all.length.tiff",height = 4,width = 8)
最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容