我喜歡的配色
常規(guī)作圖
一、移動(dòng)目錄下所有文件
find . -name "*.bam" -type f -exec mv {} ~/ChIP_seq/bam \;
find . -name "*.bai" -type f -exec mv {} ~/ChIP_seq/bam \;
find . -name "*.fastq.gz" -type f -exec mv {} ~/ChIP_seq/fastq \;
二、批量重命名
#去除_all_indel.bed之后的
ls *_all_indel.bed | while read id ;
do
mv $id `echo ${id%_all_indel.bed*}_all_indel.vcf`
done
#去除_之前的
ls *fastq.gz | while read id ;
do
mv $id `echo ${id#*_}`
done
#fq改成fastq
ls *.fq.gz | while read id ;
do
mv $id `echo ${id%.fq.gz*}.fastq.gz`
done
#Delete some files
for((i=1;i<=24;i++));
do
n="YM-${i}_S${i}*.gz"
rm $n
done
三、awk篩選提取大文件
替換fasta header
awk '{ if ($0~/^>/) { n=split($0, a, "|"); gsub(/_/," ", a[1]); printf("%s|%s\n", a[1], substr(a[2], 2)); } else { print $0; } }' ${mouse_fa_dir}/mouse.lncRNA.fa>${mouse_fa_dir}/mouse.lncRNA.renmae.fa
提取特定的reads,并替換bam文件的header
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:mouse_chr[1-9]|^SN:mouse_chrX|^SN:mouse_chrY|^SN:mouse_chrM/{print \$0} \$3~/^mouse_chr[1-9]|X|Y|M/{print \$0} ' | sed 's/mouse_chr/chr/g' | samtools view -bS - >${mouse_dir}/${i%.bam*}.mouse.bam
samtools view -h -@ ${thread} ${dir}/${i} | awk 'BEGIN{FS=OFS=\"\t\"} (/^@/ && !/@SQ/){print \$0} \$2~/^SN:human_chr[1-9]|^SN:human_chrX|^SN:human_chrY|^SN:human_chrM/{print \$0} \$3~/^human_chr[1-9]|X|Y|M/{print \$0} ' | sed 's/human_chr/chr/g' | samtools view -bS - >${human_dir}/${i%.bam*}.human.bam
四、wget 用法
你應(yīng)該了解的所有wget命令
五、壓縮與解壓縮
壓縮與解壓縮
六、文件夾大小
Linux查看文件和文件夾大小
du -h --max-depth=1
七、 統(tǒng)計(jì)文件數(shù)目
統(tǒng)計(jì)文件數(shù)目2
ls -l | grep "^-" | wc -l #文件數(shù)
wc - lcw file1 #文件行數(shù)
八、csv文件轉(zhuǎn)換
刪除quote
ml csvkit
csvformat -T human_pc_cor.csv > human_pc_cor.csv
9、linux for 循環(huán)
例子1
for i in Fed1 Fed2 Fed3 Fasting5 Fasting6 Fasting7
do
echo $(cat ${data_dir}/${i}/${i}.m6A_output.bed | sort -k1,1 -k2,2n >${out_dir}/${i}.m6a.sorted.bed)
done
例子2
for i in {1..16..2}
do
j=`expr ${i} + 1`
job_file="${job_dir}/S${i}_S${j}.footprinting_diff.job"
echo "#!/bin/bash
#SBATCH --job-name=S${i}_S${j}.footprinting_diff
#SBATCH --output=${log_dir}/S${i}_S${j}.footprinting_diff.out
#SBATCH --time=10:00:00
#SBATCH --gres=lscratch:20
#SBATCH --cpus-per-task=${threads}
#SBATCH --mem=100g
module load rgt
diff_out_dir_sample=$diff_out_dir/S${i}_S${j}
mkdir -p \${diff_out_dir_sample}
rgt-hint differential --organism=mm10 --bc --nc 30 --mpbs-files=${match_out_dir}/CJ5439_S${i}_mpbs.bed,${match_out_dir}/CJ5439_S${j}_mpbs.bed \
--reads-files=${bam_dir}/CJ5439_S${i}.sort.bam,${bam_dir}/CJ5439_S${j}.sort.bam --conditions=CJ5439_S${i},CJ5439_S${j} \
--output-location=\${diff_out_dir_sample} --output-prefix=S${i}_S${j}
" > $job_file
sbatch $job_file
done
各類(lèi)文件轉(zhuǎn)換
R語(yǔ)言常用
R語(yǔ)言使用的一些技巧
R語(yǔ)言數(shù)據(jù)科學(xué)
Bugs
1.org.Hs.eg.db
Error: package or namespace load failed for ‘org.Hs.eg.db’:
.onLoad failed in loadNamespace() for 'org.Hs.eg.db', details:
call: l$contains
error: $ operator is invalid for atomic vectors
加上options(connectionObserver = NULL)就ok
Error in UseMethod("select") :
no applicable method for 'select' applied to an object of class "c('OrgDb', 'AnnotationDb', 'envRefClass', '.environment', 'refClass', 'environment', 'refObject', 'AssayData')"
使用:
AnnotationDbi::select(org.Hs.eg.db, keys=as.character(human_fraction_df_sig_genes), columns=c("SYMBOL","ENTREZID"), keytype="SYMBOL")
rm(list = ls())
options(stringsAsFactors=F)
options(scipen = 200)#取消科學(xué)記數(shù)法
#函數(shù)變量轉(zhuǎn)化
get()#返回與字符串同名的變量的值
assign()#為字符串變量的字符串賦值 eg: assign(x_name, read.table(file_name))
substitute()#將變量名轉(zhuǎn)化為同名字符串
#替換
gsub("Efg", "AAA", text) #將Efg改為AAA,區(qū)分大小寫(xiě)
#首字母大寫(xiě)
suppressMessages(library(stringr))
str_to_title(your_data, locale = "")
#刪除開(kāi)頭和結(jié)尾兩個(gè)字符
gsub('^..|..$', '', x)
1、獲取目錄下文件
mytsvfile = list.files(pattern="*.tsv")
list2env(
lapply(setNames(mytsvfile, make.names(gsub("*.tsv$", "", mytsvfile))),
read.table,header=T,row.names=1,check.names=FALSE,skip = 1), envir = .GlobalEnv)
files<-unlist(lapply(mytsvfile, FUN = function(x) {return(strsplit(x, split = ".tsv",fixed = T)[[1]][1])}))
2、隨機(jī)取數(shù)
combn()
3、批量運(yùn)算內(nèi)置文件輸出時(shí),input為數(shù)字(等待優(yōu)化)
library(parallel)
cl <- makeCluster(40)
results<-parSapply(cl,1:length(com_list),poly_met)
stopCluster(cl)
message("Finish")
4、技巧性代碼
#分割
human_overlap_lnc<-unlist(lapply(genelist, FUN = function(x) {return(strsplit(x, split = ".",fixed = T)[[1]][1])}))
unlist(strsplit(gene,"\\|",))[1]
#替換和截取字符
human_rnaseq_pick$sample<-as.character(gsub('Fed', 'AL', as.character(human_rnaseq_pick$sample), fixed=F))
human_rnaseq_pick$group<-as.character(substr(as.character(human_rnaseq_pick$sample),1,nchar(as.character(human_rnaseq_pick$sample))-1))
#list 操作
#把列表每個(gè)元素的""去掉
data.new<-lapply(data,function(x){x[which(x!="")]})
#批量解壓bz2文件
rm(list=ls())
suppressMessages(library(R.utils))
unzip_zb2<-function(out_dir,model_dir,prefix){
dir.create(out_dir,recursive = T)
for (i in 0:199) {
file<-paste0(model_dir,as.character(i),"/","farna_rebuild.out.bz2")
new_name <- paste0(out_dir,prefix,"_model_",as.character(i),".out") # 新文件名為去除.bz2后綴的原文件名
bunzip2(file, destname = new_name,remove=F) # 解壓縮文件
}
}
unzip_zb2(out_dir=human_out_dir,model_dir=human_dir,prefix = "human_24")
#自定義梯度顏色
rm(list=ls())
get_new_clor<-function(ratio,minVal,maxVal,color1,color2){
suppressMessages(library(circlize))
col_fun=colorRamp2(c(minVal,maxVal),c(color1,color2))
return(col_fun(ratio))
}
ratio=h24$X.struct_mfe
minVal <- min(ratio)
maxVal <- max(ratio)
color1 <- "white"
color2 <- "#CE86AF"
get_new_clor(ratio,minVal,maxVal,color1,color2)
5、ggplot2 相關(guān)
ggplot2——坐標(biāo)系篇
截?cái)?/a>
#1、坐標(biāo)軸標(biāo)簽分割及旋轉(zhuǎn),使用scales包進(jìn)行分割
suppressMessages(library(scales))
p<-ggplot(data=yourdata,aes(x = x, y = y))
p<-p+scale_y_discrete(labels = wrap_format(50))#Y軸五十個(gè)字符分割
#2、設(shè)置橫坐標(biāo)字體大小、顏色、旋轉(zhuǎn)等
p<-p+theme(axis.text.x = element_text(size = 15, family = "myFont", color = "green", face = "bold", vjust = 1, hjust = 1, angle = 45))
p<-p+theme(axis.text.x = element_text( vjust = 1, hjust = 1, angle = 45))
#3、去除背景,設(shè)置空白背景
p<-p+theme_bw()+theme_classic()#設(shè)置空白背景
#4、保存
filename=paste0(your_dir,"/signature.pdf")
ggsave(filename,p,units = "in",width=6,height = 4.5,device = "pdf")
#5、去除x軸刻度
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())
#7、坐標(biāo)軸截?cái)?human_new_df$"type"<-factor(human_new_df$type, levels=unique(human_new_df$type))
human_new_df$"treatment"<-factor(human_new_df$treatment, levels=unique(human_new_df$treatment))
p1<-ggboxplot(human_new_df, x = "type", y = "polya_length",
palette =c("#4DBBD5FF","#E64B35FF"),fill = "treatment",legend = "none",
ylab = "Poly(A) length")+coord_cartesian(ylim = c(0,160))+
theme(axis.text.x = element_text( vjust = 1, hjust = 1, angle = 45))
p2<-ggboxplot(human_new_df, x = "type", y = "polya_length",
palette =c("#4DBBD5FF","#E64B35FF"),fill = "treatment")+
labs(x=NULL,y=NULL,fill=NULL) +
theme(axis.text.x = element_blank(),axis.ticks.x = element_blank(),axis.line.x= element_blank())+
coord_cartesian(ylim = c(600,800))+scale_y_continuous(breaks = c(600,800,100))
ggarrange(p2,p1,heights=c(1/8, 7/8),ncol = 1, nrow = 2,common.legend = TRUE,legend="top",align = "v")
6、ggpubr 相關(guān)問(wèn)題
固定組別順序,使出的圖按照固定順序排列
#固定group順序
signature$Group <- factor(signature$Group, levels=unique(signature$Group))
ggpubr 統(tǒng)計(jì)相關(guān)問(wèn)題
有關(guān)兩組之間大比較如何加星號(hào)

注意圖中加深部位不能有雙引號(hào),特別在批量定義函數(shù)時(shí)候,要用原始值不能用函數(shù)
我的例子
例子1
new_data<-new_ccle_data %>% gather(key=Gene_name, value=expression_levels,-pick_levels)
colnames(new_data)<-c("Levels","Gene_name","Relative_expression_levels")
filename=paste0(data_dir,"/",j,"_expression_levels_in_",i,"_Median.pdf")
pdf(filename,width=3.5,height=5)
p <- ggboxplot(new_data, x = "Gene_name", y = "Relative_expression_levels",
color = "Levels", palette ="jco",
shape = "Levels",group="Levels")
p<-p + stat_compare_means(aes(group = Levels),label = "p.signif",hide.ns=F, paired=F,method = "t.test")
plot(p)
dev.off()
#例子2
data<-degs %>% left_join(pcr,by="New_name")
data<-data[complete.cases(data$Relative_Exp),]
p <- ggbarplot(data, x = "New_name", y = "Relative_Exp",
color = "Treatment", palette = c("#4DBBD5","#E64B35"),
ylab = "Relative Expression Levels",position = position_dodge(0.9),
fill = "Treatment",alpha=0.2,add = c("mean_se", "jitter"),add.params = list(size=0.5))
p<-p + stat_compare_means(aes(group = Treatment),label = "p.signif",hide.ns=F, paired=F,method = "t.test")
p<-p+theme(axis.text.x = element_text(size = 7, vjust = 0.7, hjust =0.7, angle = 45))
#例子3
p<-ggboxplot(data_df, x = "group", y = "polya_length",
color = "group", palette ="npg",
shape = "group",ylab = "Poly(A) Tail Length")+
facet_grid(~new_id)
my_comparisons <- list(c("DMSO", "PPARa"),c("DMSO", "PPARg"),c("DMSO", "FXR"))
p<-p+ stat_compare_means(comparisons=my_comparisons,
label = "p.signif",hide.ns=F, paired=F,method = "t.test") +
theme(
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x=element_blank()
)
plot(p)
8、tidyverse使用
R語(yǔ)言長(zhǎng)短數(shù)據(jù)轉(zhuǎn)換
suppressMessages(library(tidyverse))
#1.篩選
rt<-df %>% filter(pvalue<0.05)
#2.長(zhǎng)變短
rt<-df %>% gather(key="New_name",value="New_name",-不變的列)
#3.短變長(zhǎng)
rt<-df %>% spread(key="old_name",value="old_name",-不變的列)
#4.合并
combind<-to_df %>%
left_join(df, by="pathway")
#5. 逐級(jí)排序
out_tab<-arrange(out_tab,desc(year),desc(month),desc(day))
arrange(out_tab,match(year, c("C","A","B")), desc(Res), desc(Pop))#指定順序排序
#6.管道統(tǒng)計(jì)作圖
data %>%
group_by(seurat_clusters,orig.ident) %>%
count() %>%
group_by(seurat_clusters) %>%
mutate(percent=100*n/sum(n)) %>%
ungroup() %>%
ggbarplot(x="seurat_clusters",y="percent", fill="orig.ident",color = "black",palette = "jco")
9、獲取GO Term的子孫后代
go_term<-read.csv("~/GO_BP_superfamily.csv",header = F)
suppressMessages(library("plyr"))
suppressMessages(library(GO.db))
list <- list()
for (i in 1:nrow(go_term)){
rt<-as.list(GOBPOFFSPRING[as.character(go_term[i,2])])
rrt<-as.matrix(rt[[1]])
colnames(rrt)<-as.character(go_term[i,1])
list[[i]] <- data.frame(rrt)
}
df<-rbind.fill(list)
write.csv(df,"~/GO_BP_superfamily_Offspring.csv")
library(tidyverse)
library(purrrlyr)
row_handler <- function(row.data){
index <- which(row.data == max(row.data)) # 找出最大的元素的index
out <- names(row.data[index]) %>% # 從index還原成列名
str_c(collapse = ",") # 拼接
return(out)
}
major_path<-nafld_new_df %>%
by_row(..f = row_handler, .collate = "rows", .to = "major_pathway")
11、批量設(shè)組
library(Hmisc)
group <- factor(gsub("(fed|fasting).*", "\\1", as.character(rt$Sample)), levels = c("fed", "fasting")))
treatment<-as.character(group)
data_pick$"Treatment"<-capitalize(treatment)
12、提取兩字符間文字
如何在R中的兩個(gè)字符之間提取文本
笨方法分割兩次,比如說(shuō)我要提取>和空格之間的數(shù)據(jù),分兩步切割,第一次提取空格之前的,第二部提取>之后的,寫(xiě)個(gè)函數(shù)就行
get_list<-function(rt){
new_rt<-unlist(lapply(as.character(rt[,1]), FUN = function(x) {return(strsplit(x, split = " ",fixed = T)[[1]][1])}))
new_rt<-unlist(lapply(as.character(new_rt), FUN = function(x) {return(strsplit(x, split = ">",fixed = T)[[1]][2])}))
return(new_rt)
}
13、去除NA值
# 除了使用na.omit()以外
rna_deg<-rna_deg[complete.cases(rna_deg[,1:5]),]
14、統(tǒng)計(jì)方法
Mann-Whitney
15、overlap不使用intersect
首先看看多組intersect
human_overlap_list<-Reduce(intersect, list(human_fast_sig_list,human_ppara_sig_list,human_pparg_sig_list,human_fxr_sig_list))
intersect 做overlap會(huì)去掉重復(fù)值所以可以用:
b[is.element(b,a)#找出b中與a overlap的元素
get_list<-function(a,b){b[is.element(b,a)]}
16、返回ENTREZID
suppressMessages(library(clusterProfiler))
suppressMessages(library(tidyverse))
suppressMessages(library(org.Hs.eg.db))
suppressMessages(library(scales))
get_gene_list<-function(list){
genes<-as.character(list)
genes<-genes[!duplicated(genes)]
gene_list<-select(org.Hs.eg.db, keys=genes, columns=c("SYMBOL","ENTREZID"), keytype="SYMBOL")
gene_list<-gene_list[!duplicated(gene_list$SYMBOL),]
return(na.omit(as.character(gene_list$ENTREZID)))
}
Snakemake注意點(diǎn)
1.轉(zhuǎn)義問(wèn)題
