Bulk RNA-seq Analysis Pipeline for Differentially Expressed Genes (DEGs)

public public 1yr ago Version: 2 0 bookmarks

Simple workflow to quantify gene-level RNA abundance and detect differentially expressed genes (DEGs) from bulk RNAseq samples. The pipeline uses kallisto or salmon to quantify transcript level abundance and DESeq2

Code Snippets

 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


input_dirs = set(path.dirname(fp) for fp in snakemake.input)
output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {snakemake.params}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_dirs}"
    " {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# Handle readr tzdata bug when using singularity mambaforge container
Sys.setenv("TZDIR"=paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))

library(tximeta)
library(DESeq2)
library(readr)
library(stringr)
library(org.Hs.eg.db)

res_dirs <- snakemake@input[['cts']]
samples_fp <- snakemake@input[['samples']]
quant_program <- snakemake@params[['aligner']]

design_formula <- snakemake@params[['formula']]

if (snakemake@threads > 1) {
    library("BiocParallel")
    parallel <- TRUE
    register(MulticoreParam(snakemake@threads))
} else {
    parallel <- FALSE
}

if (quant_program == 'salmon') {
    files <- file.path(res_dirs, "quant.sf")
} else{
    print("quant_program wasn't Salmon!")
    stop()
}

names(files) <- basename(dirname(files))

samples <- read.csv(samples_fp)
samples$id <- paste(samples$patient, "-", samples$condition, sep = "")
print("First")
print(samples)
# Reorder rows so they match files order
samples <- samples[match(names(files), samples$id),]
print("Second")
print(files)
print(samples)

# Check that matching worked
samples$names <- names(files)
samples$files <- files
stopifnot(all(samples$id == samples$names))
stopifnot(all(samples$names == basename(dirname(samples$files))))
print(samples)

# Save SummarizedExperiment with tx-level data for future use
se <- tximeta(samples)
se <- addIds(se, "SYMBOL", gene = T)
saveRDS(se, snakemake@output[['se_tx']])

# Collapse to gene-level for DE analysis
gse <- summarizeToGene(se)
gse <- addIds(gse, "SYMBOL", gene = T)
f <- as.formula(design_formula)

# No longer need se object - remove from memory
rm(se); gc()

## Gene-level
print("Building DESeq object for gene-level features")
dds <- DESeqDataSet(gse, design = f)
print("Old ordering for condition")
print(colData(dds)$condition)

## Ensure factor ordering based on config specifications
vars <- snakemake@params[['levels']]
var_levels <- str_split(vars, ';', simplify=T)
for (var in var_levels) {
    print(paste("Variable:", var))
    s <- str_split(var, '=|,', simplify=T)
    col <- s[1, 1]
    level_order = s[1, 2:dim(s)[2]]
    colData(dds)[, col] <- factor(colData(dds)[, col], level_order)
    print(paste("Ordering for", col))
    print(levels(colData(dds)[, col]))
}

dds <- DESeq(dds, parallel=parallel)
print(dds)

vst_cts <- vst(dds, blind=FALSE)
print(vst_cts)

saveRDS(dds, file=snakemake@output[['deseq']])
saveRDS(vst_cts, file=snakemake@output[['cts']])
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# Handle readr tzdata bug when using singularity mambaforge container
Sys.setenv("TZDIR"=paste0(Sys.getenv("CONDA_PREFIX"), "/share/zoneinfo"))

library(DESeq2)
library(IHW)
library(ggplot2)
library(dplyr)
library(stringr)

if (snakemake@threads > 1) {
    library("BiocParallel")
    parallel <- TRUE
    register(MulticoreParam(snakemake@threads))
} else {
    parallel <- FALSE
}

dds <- readRDS(snakemake@input[[1]])



print(dds)

# Grab last variable at end of formula for contrasts
design_formula <- snakemake@params[['formula']]
s <- str_remove_all(design_formula, " |~")
s <- str_split(s, "\\+")
vars <- s[[1]]
var <- vars[length(vars)]

print(snakemake@params[['contrast']])
print("Creating results for the following variable:")
print(var)

print(resultsNames(dds))

contrast_coef <- paste(c(var, snakemake@params[['contrast']][1], "vs", snakemake@params[['contrast']][2]), collapse="_")
de_contrast <- c(var, snakemake@params[['contrast']][1], snakemake@params[['contrast']][2])

mle_res <- results(dds, contrast=de_contrast, filterFun=ihw, alpha = .05, parallel = parallel)
map_res <- lfcShrink(dds, coef=contrast_coef, type = "apeglm", parallel = parallel)

# Add gene symbols from rowData
print("res and dds ensgenes match?")
stopifnot(all(rownames(mle_res) == rownames(rowData(dds))))
stopifnot(all(rownames(map_res) == rownames(rowData(dds))))
print("Ensgenes match. Adding symbols to results dataframes")
mle_res$symbol <- rowData(dds)$SYMBOL
map_res$symbol <- rowData(dds)$SYMBOL

print("MLE LFC:")
print(mle_res)
print(summary(mle_res))

print("MAP LFC:")
print(map_res)
print(summary(map_res))

mle_df <- mle_res %>%
  data.frame() %>%
  tibble::rownames_to_column(var = "ensgene") %>%
  as_tibble() %>%
  dplyr::select(symbol, ensgene, everything()) %>%
  dplyr::arrange(padj)

map_df <- map_res %>%
    data.frame() %>%
    tibble::rownames_to_column(var = "ensgene") %>%
    as_tibble() %>%
    dplyr::select(symbol, ensgene, everything()) %>%
    dplyr::arrange(padj)

print("MLE dataframe")
print(mle_df)
print("MAP dataframe")
print(map_df)


mleplot <- mle_df %>%
    dplyr::mutate(
        significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
        direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
    ) %>%
    ggplot(aes(log10(baseMean),log2FoldChange)) +
    geom_point(aes(color = significant, shape = direction)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "log10 Expression", y = "MLE Log2FoldChange") +
    theme_bw()

mapplot <- map_df %>%
    dplyr::mutate(
        significant = ifelse(padj < .05, "padj < .05", "padj >= .05"),
        direction = ifelse(log2FoldChange > 0, "Upregulated", "Downregulated")
    ) %>%
    ggplot(aes(log10(baseMean),log2FoldChange)) +
    geom_point(aes(color = significant, shape = direction)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    labs(x = "log10 Expression", y = "MAP Log2FoldChange") +
    theme_bw()

readr::write_csv(mle_df, snakemake@output[['mleres']])
readr::write_csv(map_df, snakemake@output[['mapres']])
ggsave(snakemake@output[['mlema']], plot = mleplot, width = 10, height = 7)
ggsave(snakemake@output[['mapma']], plot = mapplot, width = 10, height = 7)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library(DESeq2)
library(ggplot2)
library(stringr)

vsd <- readRDS(snakemake@input[[1]])

print(vsd)

label_vars <- str_split(snakemake@params[['label_vars']], ',', simplify=T)[1, ]

pcaplot <- plotPCA(vsd, intgroup=label_vars)

ggsave(snakemake@output[[1]], pcaplot)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import pandas as pd




def check_validation(fp):
    sampleid = os.path.basename(fp)
    sampleid = sampleid.replace(".txt", "")
    with open(fp, 'r') as f:
        line = f.readline()
        while line:
            if 'Error found' in line:
                return sampleid, 'Failed'
            elif 'no errors found' in line:
                return sampleid, 'Passed'
            line = f.readline()
        return sampleid, 'No log data found in file'

def main():
    files = snakemake.input
    results = [check_validation(f) for f in files]
    df = pd.DataFrame.from_dict({'patient' : [t[0] for t in results], 
        'result' : [t[1] for t in results]})
    print(df)
    df.to_csv(snakemake.output[0], index = False)



if __name__ == '__main__':
    main()
158
159
160
161
shell:
    """
    salmon quant -i {input.idx} -l A {params.fqs} -p {threads} -o {output} {params.reps}
    """
179
180
script:
    "scripts/deseq2.R"
197
198
script:
    "scripts/diffexp.R"
SnakeMake From line 197 of main/Snakefile
208
209
wrapper:
    "0.50.4/bio/multiqc"
221
222
script:
    "scripts/plot_pca.R"
SnakeMake From line 221 of main/Snakefile
232
233
234
235
236
237
238
239
shell:
    """
    tmpdir=qc/fastqc/.{wildcards.sample_id}.tmp
    mkdir $tmpdir
    mkdir {output}
    fastqc {input} -o {output} &> >(tee {log}) -d $tmpdir
    rm -r $tmpdir
    """
250
251
252
253
shell:
    """
    biopet-validatefastq {params} 2>&1 | tee {output}
    """
260
261
script:
    "scripts/summarize_fastqValidation.py"
SnakeMake From line 260 of main/Snakefile
ShowHide 4 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/sveizades/bulk-rnaseq
Name: bulk-rnaseq
Version: 2
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Other Versions:
Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...