生信常用基礎(chǔ)語(yǔ)法及技巧

我喜歡的配色
常規(guī)作圖
一、移動(dòng)目錄下所有文件

find . -name "*.bam" -type f -exec mv {} ~/ChIP_seq/bam \; 
find . -name "*.bai" -type f -exec mv {} ~/ChIP_seq/bam \; 
find . -name "*.fastq.gz" -type f -exec mv {} ~/ChIP_seq/fastq \; 

二、批量重命名

#去除_all_indel.bed之后的
ls *_all_indel.bed | while read id ;
do
mv $id `echo ${id%_all_indel.bed*}_all_indel.vcf`
done
#去除_之前的
ls *fastq.gz | while read id ;
do
mv $id `echo ${id#*_}`
done
#fq改成fastq
ls *.fq.gz | while read id ;
do
mv $id `echo ${id%.fq.gz*}.fastq.gz`
done

#Delete some files

for((i=1;i<=24;i++));  
do   
n="YM-${i}_S${i}*.gz"
rm $n
done  

三、awk篩選提取大文件
替換fasta header

awk '{ if ($0~/^>/) { n=split($0, a, "|"); gsub(/_/," ", a[1]); printf("%s|%s\n", a[1], substr(a[2], 2)); } else { print $0; } }' ${mouse_fa_dir}/mouse.lncRNA.fa>${mouse_fa_dir}/mouse.lncRNA.renmae.fa

提取特定的reads,并替換bam文件的header

samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:mouse_chr[1-9]|^SN:mouse_chrX|^SN:mouse_chrY|^SN:mouse_chrM/{print \$0}  \$3~/^mouse_chr[1-9]|X|Y|M/{print \$0} ' | sed 's/mouse_chr/chr/g' | samtools view -bS - >${mouse_dir}/${i%.bam*}.mouse.bam
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:human_chr[1-9]|^SN:human_chrX|^SN:human_chrY|^SN:human_chrM/{print \$0}  \$3~/^human_chr[1-9]|X|Y|M/{print \$0} ' | sed 's/human_chr/chr/g' | samtools view -bS - >${human_dir}/${i%.bam*}.human.bam

四、wget 用法
你應(yīng)該了解的所有wget命令
五、壓縮與解壓縮
壓縮與解壓縮
六、文件夾大小
Linux查看文件和文件夾大小

du -h --max-depth=1

七、 統(tǒng)計(jì)文件數(shù)目
統(tǒng)計(jì)文件數(shù)目2

ls -l | grep "^-" | wc -l #文件數(shù)
wc - lcw file1 #文件行數(shù)

八、csv文件轉(zhuǎn)換
刪除quote

ml csvkit
csvformat -T human_pc_cor.csv > human_pc_cor.csv

9、linux for 循環(huán)

例子1
for i in Fed1 Fed2 Fed3 Fasting5 Fasting6 Fasting7
do
echo $(cat ${data_dir}/${i}/${i}.m6A_output.bed | sort -k1,1 -k2,2n  >${out_dir}/${i}.m6a.sorted.bed)
done

例子2
for i in {1..16..2} 
do
j=`expr ${i} + 1`
job_file="${job_dir}/S${i}_S${j}.footprinting_diff.job"

    echo "#!/bin/bash
#SBATCH --job-name=S${i}_S${j}.footprinting_diff
#SBATCH --output=${log_dir}/S${i}_S${j}.footprinting_diff.out
#SBATCH --time=10:00:00
#SBATCH --gres=lscratch:20
#SBATCH --cpus-per-task=${threads}
#SBATCH --mem=100g
module load rgt
diff_out_dir_sample=$diff_out_dir/S${i}_S${j}
mkdir -p \${diff_out_dir_sample}
rgt-hint differential --organism=mm10 --bc --nc 30 --mpbs-files=${match_out_dir}/CJ5439_S${i}_mpbs.bed,${match_out_dir}/CJ5439_S${j}_mpbs.bed \
--reads-files=${bam_dir}/CJ5439_S${i}.sort.bam,${bam_dir}/CJ5439_S${j}.sort.bam --conditions=CJ5439_S${i},CJ5439_S${j} \
--output-location=\${diff_out_dir_sample} --output-prefix=S${i}_S${j}
" > $job_file
sbatch $job_file
done

各類(lèi)文件轉(zhuǎn)換

R語(yǔ)言常用

R語(yǔ)言使用的一些技巧
R語(yǔ)言數(shù)據(jù)科學(xué)

Bugs

1.org.Hs.eg.db
Error: package or namespace load failed for ‘org.Hs.eg.db’:
 .onLoad failed in loadNamespace() for 'org.Hs.eg.db', details:
  call: l$contains
  error: $ operator is invalid for atomic vectors

加上options(connectionObserver = NULL)就ok

Error in UseMethod("select") : 
  no applicable method for 'select' applied to an object of class "c('OrgDb', 'AnnotationDb', 'envRefClass', '.environment', 'refClass', 'environment', 'refObject', 'AssayData')"
使用:
AnnotationDbi::select(org.Hs.eg.db, keys=as.character(human_fraction_df_sig_genes), columns=c("SYMBOL","ENTREZID"), keytype="SYMBOL") 

rm(list = ls())
options(stringsAsFactors=F)
options(scipen = 200)#取消科學(xué)記數(shù)法

#函數(shù)變量轉(zhuǎn)化
get()#返回與字符串同名的變量的值
assign()#為字符串變量的字符串賦值 eg: assign(x_name, read.table(file_name))
substitute()#將變量名轉(zhuǎn)化為同名字符串
#替換
gsub("Efg", "AAA", text) #將Efg改為AAA,區(qū)分大小寫(xiě)

#首字母大寫(xiě)
suppressMessages(library(stringr))
str_to_title(your_data, locale = "")
#刪除開(kāi)頭和結(jié)尾兩個(gè)字符
gsub('^..|..$', '', x)

1、獲取目錄下文件

mytsvfile = list.files(pattern="*.tsv")   
list2env(
 lapply(setNames(mytsvfile, make.names(gsub("*.tsv$", "", mytsvfile))),
        read.table,header=T,row.names=1,check.names=FALSE,skip = 1), envir = .GlobalEnv)
files<-unlist(lapply(mytsvfile, FUN = function(x) {return(strsplit(x, split = ".tsv",fixed = T)[[1]][1])}))

2、隨機(jī)取數(shù)

combn()

3、批量運(yùn)算內(nèi)置文件輸出時(shí),input為數(shù)字(等待優(yōu)化)

library(parallel)
cl <- makeCluster(40)
results<-parSapply(cl,1:length(com_list),poly_met)
stopCluster(cl)
message("Finish")

4、技巧性代碼

#分割
human_overlap_lnc<-unlist(lapply(genelist, FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
unlist(strsplit(gene,"\\|",))[1]

#替換和截取字符

human_rnaseq_pick$sample<-as.character(gsub('Fed', 'AL', as.character(human_rnaseq_pick$sample), fixed=F))
human_rnaseq_pick$group<-as.character(substr(as.character(human_rnaseq_pick$sample),1,nchar(as.character(human_rnaseq_pick$sample))-1))

#list 操作
#把列表每個(gè)元素的""去掉
data.new<-lapply(data,function(x){x[which(x!="")]})

#批量解壓bz2文件
rm(list=ls())
suppressMessages(library(R.utils))

unzip_zb2<-function(out_dir,model_dir,prefix){
  dir.create(out_dir,recursive = T)
  for (i in 0:199) {
    file<-paste0(model_dir,as.character(i),"/","farna_rebuild.out.bz2")
    new_name <- paste0(out_dir,prefix,"_model_",as.character(i),".out") # 新文件名為去除.bz2后綴的原文件名
    bunzip2(file, destname = new_name,remove=F) # 解壓縮文件
  }
}

unzip_zb2(out_dir=human_out_dir,model_dir=human_dir,prefix = "human_24")

#自定義梯度顏色
rm(list=ls())
get_new_clor<-function(ratio,minVal,maxVal,color1,color2){
  suppressMessages(library(circlize))
  col_fun=colorRamp2(c(minVal,maxVal),c(color1,color2))
  return(col_fun(ratio))
}
ratio=h24$X.struct_mfe
minVal <- min(ratio)
maxVal <- max(ratio)
color1 <- "white"  
color2 <- "#CE86AF"  
get_new_clor(ratio,minVal,maxVal,color1,color2)

5、ggplot2 相關(guān)
ggplot2——坐標(biāo)系篇
截?cái)?/a>

#1、坐標(biāo)軸標(biāo)簽分割及旋轉(zhuǎn),使用scales包進(jìn)行分割
suppressMessages(library(scales))
p<-ggplot(data=yourdata,aes(x = x, y = y))
p<-p+scale_y_discrete(labels = wrap_format(50))#Y軸五十個(gè)字符分割

#2、設(shè)置橫坐標(biāo)字體大小、顏色、旋轉(zhuǎn)等
p<-p+theme(axis.text.x = element_text(size = 15, family = "myFont", color = "green", face = "bold", vjust = 1, hjust = 1, angle = 45))
p<-p+theme(axis.text.x = element_text( vjust = 1, hjust = 1, angle = 45))
#3、去除背景,設(shè)置空白背景
p<-p+theme_bw()+theme_classic()#設(shè)置空白背景

#4、保存
filename=paste0(your_dir,"/signature.pdf")
ggsave(filename,p,units = "in",width=6,height = 4.5,device = "pdf")

#5、去除x軸刻度
theme(axis.title.x=element_blank(),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())
#7、坐標(biāo)軸截?cái)?human_new_df$"type"<-factor(human_new_df$type, levels=unique(human_new_df$type))
human_new_df$"treatment"<-factor(human_new_df$treatment, levels=unique(human_new_df$treatment))
p1<-ggboxplot(human_new_df,  x = "type", y = "polya_length",
             palette =c("#4DBBD5FF","#E64B35FF"),fill = "treatment",legend = "none",
             ylab = "Poly(A) length")+coord_cartesian(ylim = c(0,160))+
            theme(axis.text.x = element_text( vjust = 1, hjust = 1, angle = 45))
p2<-ggboxplot(human_new_df,  x = "type", y = "polya_length",
              palette =c("#4DBBD5FF","#E64B35FF"),fill = "treatment")+
              labs(x=NULL,y=NULL,fill=NULL) +
              theme(axis.text.x = element_blank(),axis.ticks.x = element_blank(),axis.line.x= element_blank())+
              coord_cartesian(ylim = c(600,800))+scale_y_continuous(breaks = c(600,800,100))  
ggarrange(p2,p1,heights=c(1/8, 7/8),ncol = 1, nrow = 2,common.legend = TRUE,legend="top",align = "v") 

6、ggpubr 相關(guān)問(wèn)題
固定組別順序,使出的圖按照固定順序排列

#固定group順序
signature$Group <- factor(signature$Group, levels=unique(signature$Group))

ggbarplot組別分開(kāi)

ggpubr加p

ggpubr 統(tǒng)計(jì)相關(guān)問(wèn)題
有關(guān)兩組之間大比較如何加星號(hào)

image.png

注意圖中加深部位不能有雙引號(hào),特別在批量定義函數(shù)時(shí)候,要用原始值不能用函數(shù)
我的例子

例子1
  new_data<-new_ccle_data %>% gather(key=Gene_name, value=expression_levels,-pick_levels)
  colnames(new_data)<-c("Levels","Gene_name","Relative_expression_levels")
    filename=paste0(data_dir,"/",j,"_expression_levels_in_",i,"_Median.pdf")
    pdf(filename,width=3.5,height=5)
    p <- ggboxplot(new_data, x = "Gene_name", y = "Relative_expression_levels",
                   color = "Levels", palette ="jco",
                    shape = "Levels",group="Levels")
    p<-p + stat_compare_means(aes(group = Levels),label = "p.signif",hide.ns=F, paired=F,method = "t.test")

    plot(p)
    dev.off() 

#例子2
data<-degs %>% left_join(pcr,by="New_name")
data<-data[complete.cases(data$Relative_Exp),]
p <- ggbarplot(data, x = "New_name", y = "Relative_Exp",
               color = "Treatment", palette = c("#4DBBD5","#E64B35"),
               ylab = "Relative Expression Levels",position = position_dodge(0.9),
               fill = "Treatment",alpha=0.2,add = c("mean_se", "jitter"),add.params = list(size=0.5))
p<-p + stat_compare_means(aes(group = Treatment),label = "p.signif",hide.ns=F, paired=F,method = "t.test")
p<-p+theme(axis.text.x = element_text(size = 7,  vjust = 0.7, hjust =0.7, angle = 45))

#例子3
      p<-ggboxplot(data_df, x = "group", y = "polya_length",
                   color = "group", palette ="npg",
                   shape = "group",ylab = "Poly(A) Tail Length")+
        facet_grid(~new_id)
      
      my_comparisons <- list(c("DMSO", "PPARa"),c("DMSO", "PPARg"),c("DMSO", "FXR"))
      p<-p+ stat_compare_means(comparisons=my_comparisons,
                               label = "p.signif",hide.ns=F, paired=F,method = "t.test") +
        theme(
          axis.title.x = element_blank(),
          axis.text.x = element_blank(),
          axis.ticks.x=element_blank()
        )
      plot(p)

8、tidyverse使用
R語(yǔ)言長(zhǎng)短數(shù)據(jù)轉(zhuǎn)換

suppressMessages(library(tidyverse))

 #1.篩選
rt<-df %>% filter(pvalue<0.05)

#2.長(zhǎng)變短
rt<-df %>% gather(key="New_name",value="New_name",-不變的列)

#3.短變長(zhǎng)

rt<-df %>% spread(key="old_name",value="old_name",-不變的列)

#4.合并

combind<-to_df %>% 
  left_join(df, by="pathway")

#5. 逐級(jí)排序
out_tab<-arrange(out_tab,desc(year),desc(month),desc(day))
arrange(out_tab,match(year, c("C","A","B")), desc(Res), desc(Pop))#指定順序排序
#6.管道統(tǒng)計(jì)作圖
data %>%

  group_by(seurat_clusters,orig.ident) %>%

  count() %>%

  group_by(seurat_clusters) %>%

  mutate(percent=100*n/sum(n)) %>%

  ungroup() %>%

  ggbarplot(x="seurat_clusters",y="percent", fill="orig.ident",color = "black",palette = "jco")

9、獲取GO Term的子孫后代

go_term<-read.csv("~/GO_BP_superfamily.csv",header = F)
suppressMessages(library("plyr"))
suppressMessages(library(GO.db))
list <- list()
for (i in 1:nrow(go_term)){
  rt<-as.list(GOBPOFFSPRING[as.character(go_term[i,2])])
  rrt<-as.matrix(rt[[1]])
  colnames(rrt)<-as.character(go_term[i,1])
  list[[i]] <- data.frame(rrt)
}
df<-rbind.fill(list)
write.csv(df,"~/GO_BP_superfamily_Offspring.csv")

10、返回最大值列名
參考
參考2

library(tidyverse)
library(purrrlyr)

row_handler <- function(row.data){  
  index <- which(row.data == max(row.data))  # 找出最大的元素的index
  out <- names(row.data[index]) %>% # 從index還原成列名
    str_c(collapse = ",") # 拼接
  return(out)
}
major_path<-nafld_new_df %>%
  by_row(..f = row_handler, .collate = "rows", .to = "major_pathway")

11、批量設(shè)組

library(Hmisc)
group <- factor(gsub("(fed|fasting).*", "\\1", as.character(rt$Sample)), levels = c("fed", "fasting")))
treatment<-as.character(group)
data_pick$"Treatment"<-capitalize(treatment)

12、提取兩字符間文字
如何在R中的兩個(gè)字符之間提取文本
笨方法分割兩次,比如說(shuō)我要提取>和空格之間的數(shù)據(jù),分兩步切割,第一次提取空格之前的,第二部提取>之后的,寫(xiě)個(gè)函數(shù)就行

get_list<-function(rt){
  new_rt<-unlist(lapply(as.character(rt[,1]), FUN = function(x) {return(strsplit(x, split = " ",fixed = T)[[1]][1])}))
  new_rt<-unlist(lapply(as.character(new_rt), FUN = function(x) {return(strsplit(x, split = ">",fixed = T)[[1]][2])}))
  return(new_rt)
}

13、去除NA值

# 除了使用na.omit()以外
rna_deg<-rna_deg[complete.cases(rna_deg[,1:5]),]

14、統(tǒng)計(jì)方法
Mann-Whitney

15、overlap不使用intersect
首先看看多組intersect

human_overlap_list<-Reduce(intersect, list(human_fast_sig_list,human_ppara_sig_list,human_pparg_sig_list,human_fxr_sig_list))

intersect 做overlap會(huì)去掉重復(fù)值所以可以用:

b[is.element(b,a)#找出b中與a overlap的元素
get_list<-function(a,b){b[is.element(b,a)]}

16、返回ENTREZID
suppressMessages(library(clusterProfiler))
suppressMessages(library(tidyverse))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(scales))
get_gene_list<-function(list){
  genes<-as.character(list)
  genes<-genes[!duplicated(genes)]
  gene_list<-select(org.Hs.eg.db, keys=genes, columns=c("SYMBOL","ENTREZID"), keytype="SYMBOL")
  gene_list<-gene_list[!duplicated(gene_list$SYMBOL),]
  return(na.omit(as.character(gene_list$ENTREZID)))
}

Snakemake注意點(diǎn)
1.轉(zhuǎn)義問(wèn)題

轉(zhuǎn)義字符

最后編輯于
?著作權(quán)歸作者所有,轉(zhuǎn)載或內(nèi)容合作請(qǐng)聯(lián)系作者
【社區(qū)內(nèi)容提示】社區(qū)部分內(nèi)容疑似由AI輔助生成,瀏覽時(shí)請(qǐng)結(jié)合常識(shí)與多方信息審慎甄別。
平臺(tái)聲明:文章內(nèi)容(如有圖片或視頻亦包括在內(nèi))由作者上傳并發(fā)布,文章內(nèi)容僅代表作者本人觀點(diǎn),簡(jiǎn)書(shū)系信息發(fā)布平臺(tái),僅提供信息存儲(chǔ)服務(wù)。

相關(guān)閱讀更多精彩內(nèi)容

  • WinRAR - 最新版本的更新 版本 5.50 1. WinRAR 和命令行 RAR 默認(rèn)使用 RAR ...
    王舒璇閱讀 2,517評(píng)論 0 2
  • 這是Linux的操作系統(tǒng),使用的VMware 安裝的虛擬機(jī)的操作系統(tǒng) 切換用戶su su -切換回root目錄(需...
    wo_monic閱讀 856評(píng)論 0 0
  • 進(jìn)入帶空格的文件或者文件夾 Linux文件權(quán)限詳解 文件和目錄權(quán)限概述 在linux中的每一個(gè)文件或目錄都包含有訪...
    annkee閱讀 2,798評(píng)論 0 4
  • 本文筆記源自這里——[實(shí)驗(yàn)樓]歡迎大家在下面交流其中有問(wèn)題的地方喜歡請(qǐng)點(diǎn)收藏,每日更新(全部已親自實(shí)踐). 一. ...
    東皇Amrzs閱讀 4,334評(píng)論 7 54
  • 一、文件/文件夾管理 ls 列出當(dāng)前目錄文件(不包括隱含文件)ls -a 列出當(dāng)前目錄文件(包括隱含文件)l...
    路癡千行閱讀 2,643評(píng)論 0 5

友情鏈接更多精彩內(nèi)容