A workflow for RNA-seq analyses in Snakemake

public public 1yr ago Version: 5 0 bookmarks

A workflow for RNA-seq analysis in Snakemake.

Table of Contents

Introduction

This workflow is a bioinformatics analysis pipeline for RNA sequencing data. The workflow is built using Snakemake - a scalabale bioinformatics workflow engine

Requirements

This workflow requires the following software to run:

  • [Snakemake][snakemake]

  • [Conda][code]

Usage

Clone workflow into working directory:

git clone https://github.com/jma1991/rnaseq.git

Execute workflow and deploy software dependencies via conda:

snakemake --use-conda

Configuration

Configure the workflow by editing the files in the config directory:

  • config.yaml is a YAML file containing the workflow metadata.

  • samples.csv is a CSV file containing the sample metadata.

  • units.csv is a CSV file contains the unit metadata.

Contributing

To contribute to the workflow, clone this repository locally and commit your code on a separate branch. Please generate unit tests for your code, and run the linter before opening a pull-request:

snakemake --generate-unit-tests # generate unit tests
snakemake --lint # run the linter

You can find more detail in our Contributing Guide . Participation in this open source project is subject to a Code of Conduct .

Thanks

I would like to thank Johannes Köster for developing the Snakemake workflow engine and Istvan Albert for writing the biostar handbook.

License

This workflow is licensed under the MIT license.
Copyright © 2020, James Ashmore

Code Snippets

21
22
shell:
    "bamCoverage --bam {input.bam} --outFileName {output.wig} --numberOfProcessors {threads} --normalizeUsing RPKM --skipNonCoveredRegions 1> {log.out} 2> {log.err}"
18
19
script:
    "../scripts/deseq2.R"
33
34
script:
    "../scripts/counts.R"
48
49
script:
    "../scripts/normcounts.R"
63
64
script:
    "../scripts/logcounts.R"
79
80
script:
    "../scripts/results.R"
44
45
script:
    "../scripts/analyzeDuprates.R"
57
58
script:
    "../scripts/duprateExpDensPlot.R"
22
23
shell:
    "fastqc -o {params.out} -t {threads} {input.fqz} 1> {log.out} 2> {log.err}"
22
23
shell:
    "genomepy install -g results/genomepy -r 'chrX' -a {wildcards.genome} 1> {log.out} 2> {log.err}"
36
37
shell:
    "gunzip -c {input} > {output}"
20
21
shell:
    "kallisto index -i {output.idx} {input.fas} 1> {log.out} 2> {log.err}"
39
40
shell:
    "kallisto quant -i {input.idx} -o {params.out} -t {threads} {params.arg} {input.fqz} 1> {log.out} 2> {log.err}"
18
19
script:
    "../scripts/limma.R"
33
34
script:
    "../scripts/voom.R"
48
49
script:
    "../scripts/counts.R"
63
64
script:
    "../scripts/normcounts.R"
78
79
script:
    "../scripts/logcounts.R"
94
95
script:
    "../scripts/toptable.R"
18
19
script:
    "../scripts/dist.R"
33
34
script:
    "../scripts/prcomp.R"
48
49
script:
    "../scripts/cmdscale.R"
63
64
script:
    "../scripts/diff.R"
78
79
script:
    "../scripts/pvalue.R"
93
94
script:
    "../scripts/volcano.R"
108
109
script:
    "../scripts/heatmap.R"
SnakeMake From line 108 of rules/plots.smk
17
18
shell:
    "bam_stat.py -i {input} 1> {output} 2> {log}"
35
36
shell:
    "inner_distance.py -i {input.bam} -r {input.bed} -o {params.out} 1> {log.out} 2> {log.err}"
50
51
shell:
    "infer_experiment.py -i {input.bam} -r {input.bed} 1> {output.txt} 2> {log.err}"
67
68
shell:
    "junction_annotation.py -i {input.bam} -r {input.bed} -o {params.out} 2> {log.err}"
84
85
shell:
    "junction_saturation.py -i {input.bam} -r {input.bed} -o {params.out} 2> {log.err}"
 99
100
shell:
    "read_distribution.py -i {input.bam} -r {input.bed} 1> {output.txt} 2> {log.err}"
115
116
shell:
    "read_duplication.py -i {input.bam} -o {params.out} 2> {log.err}"
SnakeMake From line 115 of rules/rseqc.smk
134
135
shell:
    "geneBody_coverage.py -i {input.bam} -r {input.bed} -o {params.out} 2> {log.err}"
SnakeMake From line 134 of rules/rseqc.smk
150
151
shell:
    "read_GC.py -i {input.bam} -o {params.out} 2> {log.err}"
SnakeMake From line 150 of rules/rseqc.smk
21
22
shell:
    "STAR --runMode genomeGenerate --runThreadN {threads} --genomeDir {output.dir} --genomeFastaFiles {input.fas} --sjdbGTFfile {input.gtf} 1> {log.out} 2> {log.err}"
42
43
shell:
    "STAR --runMode alignReads --runThreadN {threads} --genomeDir {input.idx} --readFilesIn {params.fqz} --readFilesCommand gunzip -c --outFileNamePrefix {params.out} --outSAMtype BAM SortedByCoordinate --outTmpDir /tmp/TMPDIR/{wildcards.sample} 1> {log.out} 2> {log.err}"
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
main <- function(input, output, params, log, threads) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(dupRadar)

    mat <- analyzeDuprates(
        bam = input$bam,
        gtf = input$gtf,
        stranded = params$stranded,
        paired = params$paired,
        threads = threads
    )

    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@threads)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
calculateMDS <- function(x) {

    # Perform multi-dimensional scaling

    d <- dist(t(x))

    d <- cmdscale(d)

    d <- as.data.frame(d)

    colnames(d) <- c("MDS.1", "MDS.2")

    return(d)

}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(ggplot2)

    dat <- read.csv("config/samples.csv", stringsAsFactors = FALSE)

    mat <- read.csv(input$csv, row.names = 1)

    mds <- calculateMDS(mat)

    dat <- merge(dat, mds, by.x = "sample", by.y = "row.names")

    plt <- ggplot(dat, aes(MDS.1, MDS.2, colour = condition)) + 
        geom_point() + 
        labs(x = "MDS 1", y = "MDS 2") + 
        theme_bw() + 
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
counts <- function(object) {
    # Return raw counts
    UseMethod("counts")
}

counts.DESeqDataSet <- function(object) {
    # DESeq2 method
    return(DESeq2::counts(object))
}

counts.DGEList <- function(object) {
    # edgeR method
    return(object$counts)

}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    obj <- readRDS(input$rds)

    mat <- counts(object = obj)

    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
DESeqDataSet <- function(object, ...) {
    # Dispatch method
    UseMethod("DESeqDataSet")
}

DESeqDataSet.tximport <- function(object, colData, design) {
    # Create DESeqDataSet object from tximport
    DESeqDataSetFromTximport(txi = object, colData = colData, design = design)
}

DESeqDataSet.rsubread <- function(object, colData, design) {
    # Create DESeqDataSet object from rsubread
    DESeqDataSetFromMatrix(countData = object$counts, colData = colData, design = design)   
}

main <- function(input, output, log, config) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(DESeq2)

    obj <- readRDS(input$rds)

    dat <- read.csv("config/samples.csv", row.names = "sample", stringsAsFactors = FALSE)

    dds <- DESeqDataSet(object = obj, colData = dat, design = ~ condition)

    dds <- DESeq(dds)

    saveRDS(dds, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@log, snakemake@config)
R From line 6 of scripts/deseq2.R
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
add.status <- function(x, fdr.threshold = 0.05) {

    # Determine the status of each gene in the results table

    x$status <- factor("NS", levels = c("Up", "NS", "Down"))

    x$status[x$log2FoldChange > 0 & x$FDR < fdr.threshold] <- "Up"

    x$status[x$log2FoldChange < 0 & x$FDR < fdr.threshold] <- "Down"

    return(x)

}

add.symbol <- function(x, n = 50) {

    # Show symbol for the top N genes in the results table

    y <- subset(x, status %in% c("Up", "Down"))

    r <- rank(-y$baseMean) + rank(-abs(y$log2FoldChange)) + rank(y$PValue)

    n <- min(n, nrow(y))

    i <- sort(r, decreasing = FALSE, index.return = TRUE)$ix[seq_len(n)]

    i <- x$geneId %in% y$geneId[i]

    x$symbol <- ""

    x$symbol[i] <- x$geneName[i]

    return(x)

}

shrink.log2FoldChange <- function(x, lfc = 3) {

    # Shrink absolute log2FoldChange values greater than threshold

    x$log2FoldChange[x$log2FoldChange > lfc] <- lfc

    x$log2FoldChange[x$log2FoldChange < -lfc] <- -lfc

    return(x)

}

main <- function(input, output, log ) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(ggplot2)

    library(ggrepel)

    library(scales)

    res <- read.csv(input$csv)

    res <- add.status(res, fdr.threshold = 0.05)

    res <- add.symbol(res, n = 50)

    res <- shrink.log2FoldChange(res, lfc = 3)

    res <- res[order(res$PValue, decreasing = TRUE), ]

    col <- c("Up" = "#d8b365", "NS" = "#f5f5f5", "Down" = "#5ab4ac")

    lab <- c(
        "Up" = sprintf("Up (%s)", comma(sum(res$status == "Up"))),
        "NS" = sprintf("NS (%s)", comma(sum(res$status == "NS"))),
        "Down" = sprintf("Down (%s)", comma(sum(res$status == "Down")))
    )

    plt <- ggplot(res, aes(log10(baseMean), log2FoldChange, colour = status, label = symbol)) + 
        geom_point() +
        scale_colour_manual(values = col, labels = lab) +
        geom_text_repel(size = 1.88, colour = "#000000", show.legend = FALSE, max.overlaps = Inf) +  
        labs(x = "Normalized mean (log10)", y = "Fold change (log2)", colour = "Status") + 
        theme_bw() + 
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
pheatmap.mat <- function(x) {

    # Return distance matrix

    as.matrix(dist(t(x), method = "euclidean"))

}

pheatmap.color <- function(x) {

    # Return color vector

    colorRampPalette(rev(RColorBrewer::brewer.pal(n = 5, name = x)))(100)

}

pheatmap.breaks <- function(x) {

    # Return breaks vector

    seq(0, max(x), length.out = 101)

}

pheatmap.cluster_rows <- function(x) {

    # Return hclust object for rows

    hclust(as.dist(x), method = "complete")

}

pheatmap.cluster_cols <- function(x) {

    # Return hclust object for columns

    hclust(as.dist(x), method = "complete")

}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(pheatmap)

    library(RColorBrewer)

    dat <- read.csv(input$csv, row.names = 1)

    mat <- pheatmap.mat(dat)

    pheatmap(
        mat = mat,
        color = pheatmap.color("Blues"),
        breaks = pheatmap.breaks(mat),
        cluster_rows = pheatmap.cluster_rows(mat),
        cluster_cols = pheatmap.cluster_cols(mat),
        filename = output$pdf,
        width = 7, 
        height = 7
    )

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(dupRadar)

    mat <- read.csv(input$csv)

    pdf(output$pdf)

    duprateExpDensPlot(DupMat = mat)

    dev.off()

}

main(snakemake@input, snakemake@output, snakemake@log)
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
selectCPM <- function(x) {

    id1 <- which(colnames(x) == "falsePos") + 1

    id2 <- ncol(x)

    idx <- seq(from = id1, to = id2)

    cpm <- x[, idx]

    cpm <- as.matrix(cpm)

    return(cpm)

}

pheatmap.mat <- function(x) {

    # Scale rows by 'variance-aware' Z-transformation

    M <- rowMeans(x, na.rm = TRUE)

    DF <- ncol(x) - 1

    isNA <- is.na(x)

    anyNA <- any(isNA)

    if (anyNA) {

        mode(isNA) <- "integer"

        DF <-  DF - rowSums(isNA)

        DF[DF == 0] <- 1

    }

    x <- x - M

    V <- rowSums(x^2, na.rm = TRUE) / DF

    x <- x / sqrt(V + 0.01)

}

pheatmap.color <- function(x) {

    # Return color vector

    colorRampPalette(rev(RColorBrewer::brewer.pal(n = 5, name = x)))(100)

}

pheatmap.breaks <- function(x) {

    # Return breaks vector

    abs <- max(abs(x))

    abs <- min(abs, 5)

    seq(-abs, +abs, length.out = 101)

}

pheatmap.cluster_rows <- function(x) {

    # Return hclust object for rows

    hclust(dist(x, method = "euclidean"), method = "complete")

}

pheatmap.cluster_cols <- function(x) {

    # Return hclust object for columns

    hclust(dist(t(x), method = "euclidean"), method = "complete")

}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(pheatmap)

    res <- read.csv(input$csv, row.names = 1)

    ind <- order(res$PValue)[seq_len(50)]

    res <- res[ind, ]

    cpm <- selectCPM(res)

    mat <- pheatmap.mat(cpm)

    pheatmap(
        mat = mat, 
        color = pheatmap.color("RdBu"), 
        breaks = pheatmap.breaks(mat), 
        cluster_rows = pheatmap.cluster_rows(mat), 
        cluster_cols = pheatmap.cluster_cols(cpm),
        labels_row = res$geneName,
        filename = output$pdf, 
        width = 6, 
        height = 8
    )

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
DGEList <- function(object, ...) {
    # Dispatch method
    UseMethod("DGEList")
}

DGEList.rsubread <- function(object, samples, group) {
    # Rsubread method
    edgeR::DGEList(counts = object$counts, samples = samples, group = group)
}

DGEList.tximport <- function(object, samples, group) {
    # tximport method
    counts <- tximport::makeCountsFromAbundance(object$counts, object$abundance, object$length, countsFromAbundance = "lengthScaledTPM")
    edgeR::DGEList(counts = counts, samples = samples, group = group)
}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(edgeR)

    library(limma)

    obj <- readRDS(input$rds)

    dat <- read.csv("config/samples.csv", row.names = "sample", stringsAsFactors = FALSE)

    dge <- DGEList(object = obj, samples = dat, group = dat$condition)

    ind <- filterByExpr(dge, group = dat$condition)

    dge <- dge[ind, , keep.lib.sizes = FALSE]

    dge <- calcNormFactors(dge)

    saveRDS(dge, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@log)
R From line 6 of scripts/limma.R
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
logcounts <- function(object) {
    # Return log-transformed counts
    UseMethod("logcounts")
}

logcounts.DESeqDataSet <- function(object) {
    # DESeq2 method
    rld <- DESeq2::rlog(object, blind = FALSE)
    return(SummarizedExperiment::assay(rld))
}

logcounts.DGEList <- function(object) {
    # edgeR method
    return(edgeR::cpm(object, log = TRUE))
}

logcounts.EList <- function(object) {
    # limma method
    return(object$E)
}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    obj <- readRDS(input$rds)

    mat <- logcounts(object = obj)

    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
normcounts <- function(object) {
    # Return normalized counts
    UseMethod("normcounts")
}

normcounts.DESeqDataSet <- function(object) {
    # DESeq2 method
    return(DESeq2::counts(object, normalized = TRUE))
}

normcounts.DGEList <- function(object) {
    # edgeR method
    return(edgeR::cpm(object))
}

normcounts.EList <- function(object) {
    # limma method
    return(2 ^ object$E)
}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    obj <- readRDS(input$rds)

    mat <- normcounts(object = obj)

    write.csv(mat, file = output$csv)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
calculatePCA <- function(x, n = 500) {

    var <- apply(x, 1, var)

    num <- min(length(var), n)

    sel <- order(var, decreasing = TRUE)[seq_len(num)]

    sub <- x[sel, ]

    pca <- prcomp(t(sub))

    dev <- pca$sdev^2 / sum(pca$sdev^2)

    dat <- data.frame("PCA.1" = pca$x[, 1], "PCA.2" = pca$x[, 2], row.names = colnames(sub))

    attr(dat, "percentVar") <- dev[1:2]

    return(dat)

}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(ggplot2)

    dat <- read.csv("config/samples.csv", stringsAsFactors = FALSE)

    mat <- read.csv(input$csv, row.names = 1)

    pca <- calculatePCA(mat, n = 500)

    dat <- merge(dat, pca, by.x = "sample", by.y = "row.names")

    var <- attr(pca, "percentVar")

    plt <- ggplot(dat, aes(PCA.1, PCA.2, color = condition)) + 
        geom_point() + 
        xlab(paste0("PC1: ", round(var[1] * 100), "% variance")) + 
        ylab(paste0("PC2: ", round(var[2] * 100), "% variance")) +
        coord_fixed() + 
        theme_bw() + 
        theme(
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(ggplot2)

    library(magick)

    res <- read.csv(input$csv, row.names = 1)

    plt <- ggplot(res, aes(x = PValue)) + 
        geom_histogram(binwidth = 0.05, colour = "black", fill = "gainsboro", boundary = 0) + 
        labs(x = "P value", y = "Frequency") + 
        theme_bw() + 
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
main <- function(input, output, log, wildcards) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(DESeq2)

    dds <- readRDS(input$rds)

    ann <- read.delim(input$tsv, header = FALSE, col.names = c("gene_id", "gene_name"))

    con <- c("condition", wildcards$A, wildcards$B)

    res <- results(dds, contrast = con)

    res <- res[order(res$pvalue), ]

    cpm <- counts(dds, normalized = TRUE)[rownames(res), ]

    idx <- list(
        A = which(dds$condition %in% wildcards$A),
        B = which(dds$condition %in% wildcards$B)
    )

    dat <- data.frame(
        geneId         = ann$gene_id[match(rownames(res), ann$gene_id)],
        geneName       = ann$gene_name[match(rownames(res), ann$gene_id)],
        baseMean       = rowMeans(cpm[, c(idx$A, idx$B)]),
        baseMeanA      = rowMeans(cpm[, idx$A]),
        baseMeanB      = rowMeans(cpm[, idx$B]),
        foldChange     = 2 ^ res$log2FoldChange,
        log2FoldChange = res$log2FoldChange,
        PValue         = res$pvalue,
        PAdj           = p.adjust(res$pvalue, method = "hochberg"),
        FDR            = res$padj,
        falsePos       = round(seq_along(res$padj) * res$padj)
    )

    out <- merge(dat, cpm[, c(idx$A, idx$B)], by.x = "geneId", by.y = "row.names", sort = FALSE)

    write.csv(out, file = output$csv, quote = FALSE, row.names = FALSE)

}

main(snakemake@input, snakemake@output, snakemake@log, snakemake@wildcards)
R From line 6 of scripts/results.R
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
main <- function(input, output, log, wildcards) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(limma)

    obj <- readRDS(input$rds)

    ann <- read.delim(input$tsv, header = FALSE, col.names = c("transcript_id", "gene_id", "gene_name"))

    fit <- lmFit(obj)

    str <- paste(wildcards$A, wildcards$B, sep = "-")

    lvl <- colnames(fit$design)

    con <- makeContrasts(contrasts = str, levels = lvl)

    fit <- contrasts.fit(fit, contrasts = con)

    fit <- eBayes(fit)

    res <- topTable(fit, number = Inf, sort.by = "P")

    cts <- 2 ^ obj$E[rownames(res), ]

    idx <- list(
        A = which(fit$design[, wildcards$A] == 1),
        B = which(fit$design[, wildcards$B] == 1)
    )

    dat <- data.frame(
        geneId         = ann$gene_id[match(rownames(res), ann$gene_id)],
        geneName       = ann$gene_name[match(rownames(res), ann$gene_id)],
        baseMean       = rowMeans(cts[, c(idx$A, idx$B)]),
        baseMeanA      = rowMeans(cts[, idx$A]),
        baseMeanB      = rowMeans(cts[, idx$B]),
        foldChange     = 2 ^ res$logFC,
        log2FoldChange = res$logFC,
        PValue         = res$P.Value,
        PAdj           = p.adjust(res$P.Value, method = "hochberg"),
        FDR            = res$adj.P.Val,
        falsePos       = round(seq_along(res$adj.P.Val) * res$adj.P.Val)
    )

    out <- merge(dat, cts[, c(idx$A, idx$B)], by.x = "geneId", by.y = "row.names", sort = FALSE)

    write.csv(out, file = output$csv, quote = FALSE, row.names = FALSE)

}

main(snakemake@input, snakemake@output, snakemake@log, snakemake@wildcards)
R From line 6 of scripts/toptable.R
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
add.status <- function(x, fdr.threshold = 0.05) {

    # Determine the status of each gene in the results table

    x$status <- factor("NS", levels = c("Up", "NS", "Down"))

    x$status[x$log2FoldChange > 0 & x$FDR < fdr.threshold] <- "Up"

    x$status[x$log2FoldChange < 0 & x$FDR < fdr.threshold] <- "Down"

    return(x)

}

add.symbol <- function(x, n = 50) {

    # Show symbol for the top N genes in the results table

    i <- sort(x$FDR, decreasing = FALSE, index.return = TRUE)$ix[seq_len(n)]

    x$symbol <- ""

    x$symbol[i] <- x$geneName[i]

    return(x)

}

main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(ggplot2)

    library(ggrepel)

    library(scales)

    res <- read.csv(input$csv, row.names = 1)

    res <- add.status(res, fdr.threshold = 0.05)

    res <- add.symbol(res, n = 50)

    res <- res[order(res$FDR, decreasing = TRUE), ]

    col <- c("Up" = "#d8b365", "NS" = "#f5f5f5", "Down" = "#5ab4ac")

    lab <- c(
        "Up" = sprintf("Up (%s)", comma(sum(res$status == "Up"))),
        "NS" = sprintf("NS (%s)", comma(sum(res$status == "NS"))),
        "Down" = sprintf("Down (%s)", comma(sum(res$status == "Down")))
    )

    plt <- ggplot(res, aes(log2FoldChange, -log10(PValue), colour = status, label = symbol)) + 
        geom_point() + 
        scale_colour_manual(values = col, labels = lab) +
        geom_text_repel(size = 1.88, colour = "#000000", show.legend = FALSE, max.overlaps = Inf) +
        labs(x = "Fold change (log2)", y = "P value (-log10)", colour = "Status") + 
        theme_bw() + 
        theme(
            aspect.ratio = 1,
            axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")),
            axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines"))
        )

    ggsave(output$pdf, plot = plt, width = 7, height = 7)

}

main(snakemake@input, snakemake@output, snakemake@log)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
main <- function(input, output, log) {

    # Log function

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script function

    library(edgeR)

    library(limma)

    dge <- readRDS(input$rds)

    mod <- model.matrix(~ 0 + condition, data = dge$samples)

    colnames(mod) <- unique(dge$samples$condition)

    els <- voom(dge, design = mod)

    saveRDS(els, file = output$rds)

}

main(snakemake@input, snakemake@output, snakemake@log)
R From line 6 of scripts/voom.R
ShowHide 41 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/jma1991/rnaseq
Name: rnaseq
Version: 5
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...