RNA-Seq Analysis Workflow with Snakemake

public public 1yr ago 0 bookmarks

RNA-seq workflow

1. 基本信息

流程管理 : snakemake

2 主要软件

软件 版本 功能
mulitqc 数据质控
salmon RNA-Seq 定量
DEseq2 差异分析

3 概要设计

3.1 目录及参数文件设计

3.1.1 目录

.
├── README.md #流程描述文件
├── config.yaml #流程参数文件
├── samples.tsv #fastq 信息
├── meta.tsv #样本分组信息文件
├── schemas #参数配置文件规范 ├── config.schema.yaml
│ ├── samples.schema.yaml
│ └── meta.schema.yaml
├── fastq #存储数据软链接
├── report #存储报告
├── result #存储中间文件 ├── 01_qc
│ ├── 02_quant
│ ├── 03_diffexp
├── rules #各部分流程snakefile  ├── qc.smk
│ ├── quant.smk
│ ├── diffexp.smk
├── scripts #存储python R脚本
└── Snakefile #main snakefile

3.1.2 config.yaml

#项目名称
PROJECT: 
#项目负责人
PERSON: 
#实验、文库信息
LIBRARYINFO: config/libraryinfo.tsv 
####################
# 输入输出
####################
#数据信息表
meta: meta.tsv
#样品信息表
samples: samples.tsv
#数据路径
DATA_DIR: fastq/
#输出路径
working_dir: result/ #工作目录
report_dir: report/ #报告路径
####################
# 分析参数
####################
#参考基因信息
ref:
 # 索引文件
 index: 
 # 注释文件
 annotation: 
# 差异分析时使用的对比组。
diffexp:
 contrasts:
 treated-vs-untreated:
 - treated
 - untreated

3.1.3 meta.tsv

<sample> <condition>
 A treated
 B untreated

3.1.4 samples.tsv

<sample> <read1.fq> <read2.fq>
 A fq1 fq2
 A fq1 fq2
 B fq1 fq2 

Code Snippets

23
24
script:
    "../scripts/deseq2.R"
42
43
script:
    "../scripts/output.Rmd"
 7
 8
 9
10
11
12
13
14
15
16
    shell:
        "fastqc {input} -o %s -t %s"%(qc_dir,global_thread)

rule multiqc:
    input:
        expand("{outdir}/{sample}_R{read}_fastqc.zip",outdir=qc_dir,sample= samples.Sample,read=[1,2])
    output:
        join(qc_dir,"multiqc_report.html")
    wrapper:
        "0.60.7/bio/multiqc"
12
13
wrapper:
    "0.60.7/bio/salmon/index"
33
34
wrapper:
    "0.60.7/bio/salmon/quant"
45
46
script:
    "../scripts/tximport.R"
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(tximport)
library(readr)
#library(EnsDb.Hsapiens.v86)
library(stringr)
library(DESeq2)
library(dplyr)
library(tidyverse)
library(rtracklayer)

#配置
parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

allfiles <- snakemake@input[['quant_files']]
contrast<-snakemake@params[['contrast']]
meta<-read.table(snakemake@params[['meta']],header=T,sep='\t')
gtf<-snakemake@params[['gtf']]

gtfinfo<-rtracklayer::import(gtf)

#按照meta中Group分组输出差异分析结果
names(allfiles)<- basename(dirname(allfiles))
meta$Condition <- factor(meta$Condition)
meta$Group <- factor(meta$Group)
rownames(meta) <- meta$Sample
samples<-meta%>%
    tibble::as_tibble()%>%
    dplyr::filter(Condition%in%contrast)
print(samples)

files=allfiles[samples$Sample]

tx2gene <- gtfinfo%>%
  tibble::as_tibble() %>% 
  dplyr::select(c(transcript_id,gene_id))%>%
  drop_na()%>%
  distinct()
colnames(tx2gene)<-c("TXNAME",'GENEID')

if (!all(file.exists(files))){
  print(files)
}

#save.image()


txi <- tximport(files, type="salmon", tx2gene=tx2gene,ignoreTxVersion=T)

#@@@@@@@@@@@@@@@@差异分析@@@@@@@@@@@@@@@@@@@@@@@#
ddsTxi <- DESeqDataSetFromTximport(txi,colData = samples,design = ~ Condition)
ref=contrast[1]
ddsTxi$Condition <- relevel(ddsTxi$Condition, ref = ref)
if (length(contrast)>2){
    dds <- DESeq(ddsTxi, parallel=parallel)
}else{
    dds <- DESeq(ddsTxi,parallel=TRUE,test='LRT',reduced=~1)
}
#@@@@@@@@@@@@@@@@获取结果、ID转换@@@@@@@@@@@@@@@@@@@@@@@#



geneMaptmp <- gtfinfo %>% 
  tibble::as_tibble() %>% 
  dplyr::filter(type=="gene") 
if ('gene_name' %in% colnames(geneMaptmp)){
  geneMap<-geneMaptmp%>%
          dplyr::select(c(gene_name,gene_id)) 
}else if ('gene_symbol' %in% colnames(geneMaptmp)){
  geneMap<-geneMaptmp%>%
          dplyr::select(c(gene_symbol,gene_id))
} else{
  geneMap<-geneMaptmp%>%
        dplyr::select(c(gene_id,gene_id))
}
colnames(geneMap)<-c("gene_name",'gene_id')

id_trans <-function(data_input){
  data_input <-data_input%>%
    as.data.frame()%>%
    tibble::rownames_to_column(var='gene') %>% 
    dplyr::left_join(geneMap,by=c('gene'='gene_id'))%>% 
    dplyr::select(gene,gene_name, everything())
} 

get_result<-function(dds,resultname){
  #res <- results(dds, contrast=contrast,  parallel=parallel)
  res <- lfcShrink(dds, coef=resultname, type="apeglm",parallel=parallel)
  res%>%
    id_trans()%>%
    mutate(contrast=resultname)
}

if (length(resultsNames(dds))>2){
    test<-'LRT'
}else{
    test<-'Wald'
}
name=resultsNames(dds)[2]
res=get_result(dds,name)
if (test=='LRT'){
  for (i in 3:length(resultsNames(dds))){
    name=resultsNames(dds)[i]
    tmp=get_result(dds,name)
    res<-res%>%
        bind_rows(tmp)
  }
}

#@@@@@@@@@@@@@@@@表达矩阵@@@@@@@@@@@@@@@@@@@@@@@#

## 输出矫正后表达量,样本量小于30时采用rlog进行转化,否则采用vsd
if (dim(colData(dds))[1]<30){
    ndds <- rlog(dds, blind=FALSE)
} else {
    ndds <- vst(dds, blind=FALSE)
}
exp <- as.data.frame(assay(ndds)) %>% 
  id_trans() %>% 
  type_convert()


#@@@@@@@@@@@@@@@@输出@@@@@@@@@@@@@@@@@@@@@@@#

Name <-snakemake@params[['Name']]
outdir<-snakemake@params[['outdir']]
outdir<-paste0(outdir,'/')
DESeq2_res_file=paste0(outdir,Name,'.DESeq2_res.tsv')
exp_file=paste0(outdir,Name,'.DESeq2_exp.tsv')
meta_file=paste0(outdir,Name,'.DESeq2_meta.tsv')

write_tsv(samples,meta_file)
write_tsv(exp,exp_file)
write_tsv(res,DESeq2_res_file)
saveRDS(ndds, file=snakemake@output[['deseq2_rds']])
19
knitr::opts_chunk$set(eval=TRUE,echo = FALSE,warning=FALSE,message=FALSE,fig.height=6, fig.width=8,fig.align='center')
28
29
30
31
32
33
htmltools::tags$script(src = "https://code.jquery.com/jquery-3.5.1.js")
htmltools::tags$script(src = "https://cdn.datatables.net/1.10.22/js/jquery.dataTables.min.js")

<style type="text/css">
@import url("https://cdn.datatables.net/1.10.22/css/jquery.dataTables.min.css");
</style>
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
/* Custom filtering function which will search data in column four between two values */
$.fn.dataTable.ext.search.push(
	function( settings, data, dataIndex ) {
		var TlogFC = parseFloat( $('#TlogFC').val(), 1 );
		var Tpadj = parseFloat( $('#Tpadj').val(), 0.5 );
		var logFC = Math.abs(parseFloat( data[3] )) || 0; // use data for the age column
    var padj = parseFloat( data[6] ) || 1; // use data for the age column

		if ( ( isNaN( TlogFC ) && isNaN( Tpadj ) ) ||
			 ( isNaN( TlogFC ) && padj <= Tpadj ) ||
			 ( TlogFC <= logFC   && isNaN( Tpadj ) ) ||
			 ( TlogFC <= logFC   && padj <= Tpadj ) )
		{
			return true;
		}
		return false;
	}
);

$(document).ready(function() {
    var table = $('#mytable').DataTable();
    // Event listener to the two range filtering inputs to redraw on input
    $('#TlogFC, #Tpadj').keyup(function() {
        table.draw();
    } );
} );
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

options(StringAsFactor = F)
options("digits" = 3)
options(pillar.sigfig=3)
library(ggpubr)
library(pheatmap)
library(DESeq2)
library(apeglm)
library(tidyverse)
library(rtracklayer)
library(dplyr)
library(clusterProfiler)
library(org.Hs.eg.db)
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
ndds<-read_rds(snakemake@input[['deseq2_rds']])
res<-read_tsv(snakemake@input[['deseq2_res']])
exp<-read_tsv(snakemake@input[['deseq2_exp']])

logFC_threshold <- snakemake@params[['logFC_threshold']]
padj_threshold <- snakemake@params[['padj_threshold']]


#输出
outdir<-snakemake@params[['outdir']]
Name <-snakemake@params[['Name']]

if (!file.exists(outdir)){
  dir.create(outdir)} 
outdir<-paste0(outdir,'/')
DEG_res=paste0(outdir,Name,'.DEG_res.tsv')
DEG_exp_file=paste0(outdir,Name,'.DEG_exp.tsv')
ma_fig=paste0(outdir,Name,'.maplot.svg')
volcano_fig=paste0(outdir,Name,'.volcano.svg')
heatmap_fig=paste0(outdir,Name,'.heatmap.svg')
pca_fig=paste0(outdir,Name,'.pca.svg')
go_fig1=paste0(outdir,Name,'.GO.barplot.svg')
go_fig2=paste0(outdir,Name,'.GO.dotplot.svg')
121
122
123
124
125
126
127
128
129
if (length(resultsNames(ndds))>2){
    test<-'LRT'
}else{
    test<-'Wald'
}

meta<-colData(ndds)%>%
    as.data.frame()
knitr::kable(meta,caption='样品信息')
138
139
140
141
#res%>%
#  drop_na()%>%
#  mutate_if(is.numeric, ~round(., 3)) %>%
#  DT::datatable(filter = 'top', options = list(autoWidth = TRUE))
155
156
157
158
159
160
res%>%
  arrange(desc(abs(log2FoldChange))) %>%
  drop_na() %>%
  dplyr::filter(padj<padj_threshold,abs(log2FoldChange)>logFC_threshold)%>%
  mutate_if(is.numeric, ~round(., 3)) %>%
  DT::datatable(filter = 'top', options = list(autoWidth = TRUE))
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#筛选结果
if (test=='Wald'){
DEG<-res%>%
  dplyr::filter(padj<padj_threshold,abs(log2FoldChange)>logFC_threshold)%>%
  arrange(padj)%>%
  as.data.frame()

} else{
#LRT
deggene<-res%>%
  dplyr::filter(padj<padj_threshold,abs(log2FoldChange)>logFC_threshold)%>%
  arrange(padj)%>%
  distinct(gene)
DEG=res[res$gene%in%deggene$gene,]
}

DEG_exp=as.data.frame(exp[exp$gene%in%DEG$gene,])

write_tsv(DEG,DEG_res)
write_tsv(DEG_exp,DEG_exp_file)
198
199
200
201
202
203
204
205
206
207
208
209
210
quant<-exp%>%
    dplyr::select(-gene,-gene_name)%>%
    as.data.frame()
p2<- prcomp(t(quant))
pca_data <- predict(p2)
pdata=pca_data[,c('PC1','PC2','PC3')]%>%
    as.data.frame()%>%
    cbind(meta)
fig<-ggscatter(pdata, x = "PC1", y = "PC2",color = "Condition",size =4,shape="Group",
    palette = 'jco',ellipse = F, mean.point = F,star.plot = F,label='Sample',
    ggtheme = ggplot2::theme_minimal())
#ggsave(pca_fig,plot=fig)
fig
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
df<-res%>%
  mutate(log10padj =-log10(res$padj))

fc_threshold=2**logFC_threshold
fig<-ggmaplot(df,fdr = padj_threshold , fc = fc_threshold , #差异阈值的设定
         size = 0.6, #点的大小
         palette = c("#B31B21", "#1465AC", "darkgray"),
         genenames = as.vector(df$gene_name),
         xlab = "A",ylab = "M",
         legend = "top", 
         top = 10, #选择展示的top基因数目
         font.label = c("bold", 10),label.rectangle = TRUE,
         font.legend = "bold",select.top.method = "fc",
         font.main = "bold",
         ggtheme = ggplot2::theme_minimal()
         )
#ggsave(ma_fig,plot=fig)
fig
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
df<-res%>%
    mutate(log10padj =-log10(res$padj))%>%
    arrange(desc(abs(log2FoldChange)))%>%
    drop_na()
df$significant <- 'unchanged' 
df$significant[df$padj < padj_threshold & df$log2FoldChange > logFC_threshold] <-'upregulated'
df$significant[df$padj < padj_threshold & df$log2FoldChange < -logFC_threshold] <-'downregulated' 

xMax <- 10
yMax <- 10

fig<-ggscatter(df, 
          x = "log2FoldChange", 
          y = "log10padj", 
          #ylim=c(0,yMax), xlim=c(-xMax,xMax),
          ylab = "-log10(padj)",
          title = 'volcano plot',
          legend = "right",
          color = "significant",
          size = 0.8,
          label = "gene_name", 
          repel = T,
          show.legend.text = F,
          palette = c("#00AFBB", "#999999", "#FC4E07") ,
          label.select = df$gene_name[1:20])+  # 筛选需要标注的基因
          theme(plot.title = element_text(hjust = 0.5))
#ggsave(volcano_fig,plot=fig)
fig


#save.image()
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
if (dim(DEG)[1]>2){
df<-DEG_exp%>%
  distinct(gene_name,.keep_all=TRUE)%>%
  drop_na()
rownames(df)<-df$gene_name
df<-df%>%
  dplyr::select(-gene,-gene_name)
fig<-pheatmap(df,annotation = meta[,c('Group','Condition')],
        scale="row",   # z-score处理
        color = colorRampPalette(c('blue','white','red'))(50),    # 低、中、高表达的颜色
        cluster_cols = T,   # 样品是否聚类
        cluster_rows = T,
        show_colnames = T,
        show_rownames = F,
        fontsize = 8,       # 字体大小     
        fontsize_row = 8, fontsize_col = 6,main='Gene count heatmap')
#ggsave(heatmap_fig,plot=fig)
fig}else{
  print("差异基因过少")
}
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
if (test=='LRT' && dim(DEG)[1]>0){

df<-DEG%>%
    distinct(gene_name,contrast,.keep_all=TRUE)%>%
    dplyr::select(gene_name,log2FoldChange,contrast)%>%
    distinct(gene_name,contrast,.keep_all=TRUE)%>%
    pivot_wider(names_from='contrast',values_from ='log2FoldChange')%>%
    #pivot_wider(names_from='contrast',values_from ='log2FoldChange',values_fill = list(log2FoldChange = 0))%>%
    as.data.frame()%>%
    drop_na(gene_name)
rownames(df)=df$gene_name
df=df[,-1]

paletteLength <- 50
myBreaks <- c(seq(min(df), 0, length.out=ceiling(paletteLength/2) + 1), 
              seq(max(df)/paletteLength, max(df), length.out=floor(paletteLength/2)))
fig<-pheatmap(df,
        border_color=F,
        color = colorRampPalette(c('blue','white','red'))(50),    # 低、中、高表达的颜色
        scale='row',
        cluster_cols = F,   # 样品是否聚类
        cluster_rows = T,
        show_colnames = T,
        show_rownames = F,
        fontsize = 8,       # 字体大小     
        fontsize_row = 8, fontsize_col = 10,
        main='Fold Change(log2) heatmap')
fig
}
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
if (dim(DEG)[1]>0){
deggene<-DEG%>%
  drop_na()
ego_ALL <- enrichGO(gene          = deggene$gene_name,
                   OrgDb         = 'org.Hs.eg.db',
                   keyType       = 'SYMBOL',
                   ont           = "ALL",
                   pAdjustMethod = "BH",
                   pvalueCutoff  = 0.1,
                   qvalueCutoff  = 0.1)

if(!is.null(ego_ALL)){
fig<-barplot(ego_ALL,showCategory=30,font.size=10)
#ggsave(go_fig1,plot=fig)
fig}
}
372
373
374
375
376
377
378
379
if (dim(DEG)[1]>0){
if(!is.null(ego_ALL)){
fig=dotplot(ego_ALL,showCategory=30,font.size=10)
#ggsave(go_fig2,plot=fig)
fig}else{
  print('无显著结果')
}
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(tximport)
library(readr)
library(stringr)
library(dplyr)
library(tidyverse)

files <- snakemake@input[['quant_files']]
names(files)<- basename(dirname(files))

gtf<-snakemake@params[['gtf']]
gtfinfo<-rtracklayer::import(gtf)

tx2gene <- gtfinfo%>%
  tibble::as_tibble() %>% 
  dplyr::select(c(transcript_id,gene_id))%>%
  drop_na()%>%
  distinct()
colnames(tx2gene)<-c("TXNAME",'GENEID')

if (!all(file.exists(files))){
  print(files[!file.exists(files)])
}

txi <- tximport(files, type="salmon", tx2gene=tx2gene,ignoreTxVersion=T)

out<-as.data.frame(txi['abundance'])
colnames(out)<-str_replace(colnames(out),'abundance.','')
out=out[rowSums(out<0.1)<(dim(out)[2]*0.8),]

write.table(out,snakemake@output[['quant_matrix']],sep='\t',quote=F)
ShowHide 19 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/xiangyugits/rnaseq-salmon-deseq2
Name: rnaseq-salmon-deseq2
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...