A Snakemake workflow to analyse Affymetrix expression arrays
Help improve this workflow!
This workflow has been published but could be further improved with some additional meta data:- Keyword(s) in categories input, output, operation, topic
You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .
A Snakemake workflow to analyse Affymetrix expression arrays
Contents
Overview
This workflow is used to analyse Affymetrix expression arrays at the probe level. It performs quality control, differential expression analysis, and gene set testing. Batch correction and blocking are also implemented. Any expression array which has a Bioconductor annotation package is supported.
Installation
Install Snakemake using the conda package manager:
$ conda create -c bioconda -c conda-forge --name snakemake snakemake
Deploy the workflow to your project directory:
$ git pull https://github.com/zifornd/arrays projects/arrays
Usage
Configure the workflow by editing the
config.yaml
file:
$ nano config/config.yaml
Define the samples by editing the
samples.tsv
file:
$ nano config/samples.tsv
Execute the workflow and install dependencies:
$ snakemake --cores all --use-conda
Documentation
See the Documentation file for all configuration, execution, and output information.
Contributing
See the Contributing file for ways to get started.
Please adhere to this project's Code of Conduct .
Authors
This workflow was developed by:
Citation
See the Citation file for ways to cite this workflow.
Acknowledgements
This workflow is based on the following research article:
Klaus B and Reisenauer S. An end to end workflow for differential gene expression using Affymetrix microarrays [version 2; peer review: 2 approved]. F1000Research 2018, 5:1384 (https://doi.org/10.12688/f1000research.8967.2)
The sticker artwork was adapted from BiocStickers and Bioicons by Guillaume Paumier .
License
This workflow is licensed under the
MIT
license.
Copyright © 2021, Zifo RnD Solutions
Code Snippets
19 20 | script: "../scripts/design.R" |
35 36 | script: "../scripts/contrasts.R" |
51 52 | script: "../scripts/limma.R" |
74 75 | script: "../scripts/toptable.R" |
95 96 | script: "../scripts/pvalue.R" |
119 120 | script: "../scripts/volcano.R" |
143 144 | script: "../scripts/heatmap.R" |
23 24 | script: "../scripts/goana.R" |
40 41 | script: "../scripts/topgo.R" |
60 61 | script: "../scripts/kegga.R" |
77 78 | script: "../scripts/topkegg.R" |
21 22 | script: "../scripts/import.R" |
39 40 | script: "../scripts/normalize.R" |
61 62 | script: "../scripts/annotate.R" |
79 80 | script: "../scripts/filter.R" |
94 95 | script: "../scripts/correct.R" |
21 22 | script: "../scripts/image.R" |
38 39 | script: "../scripts/hist.R" |
55 56 | script: "../scripts/boxplot.R" |
72 73 | script: "../scripts/pca.R" |
89 90 | script: "../scripts/ma.R" |
106 107 | script: "../scripts/mds.R" |
123 124 | script: "../scripts/dens.R" |
140 141 | script: "../scripts/hm.R" |
155 156 | script: "../scripts/msd.R" |
172 173 | script: "../scripts/rle.R" |
17 18 | shell: "conda create --quiet --yes --prefix {params.path} --strict-channel-priority --override-channels --channel conda-forge --channel bioconda --channel defaults bioconductor-{params.name}=8.7.0 1> {log.out} 2> {log.err}" |
31 32 | shell: "conda create --quiet --yes --prefix {params.path} --strict-channel-priority --override-channels --channel conda-forge --channel bioconda --channel defaults bioconductor-{params.name}=3.13.0 1> {log.out} 2> {log.err}" |
45 46 | shell: "conda create --quiet --yes --prefix {params.path} --strict-channel-priority --override-channels --channel conda-forge --channel bioconda --channel defaults bioconductor-{params.name}=3.14.1 1> {log.out} 2> {log.err}" |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(oligo) library( params$organism, character.only = TRUE, lib.loc = "resources/bioconductor/organism/lib/R/library" ) library( params$annotation, character.only = TRUE, lib.loc = "resources/bioconductor/annotation/lib/R/library" ) obj <- readRDS(input$rds) pkg <- eval(parse(text = params$annotation)) ids <- featureNames(obj) ann <- data.frame( PROBEID = ids, ENTREZID = I(mapIds(pkg, keys = ids, column = "ENTREZID", keytype = "PROBEID", multiVals = "list")), SYMBOL = I(mapIds(pkg, keys = ids, column = "SYMBOL", keytype = "PROBEID", multiVals = "list")), GENENAME = I(mapIds(pkg, keys = ids, column = "GENENAME", keytype = "PROBEID", multiVals = "list")), row.names = ids ) fData(obj) <- ann saveRDS(obj, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | boxplot <- function(object, col) { UseMethod("boxplot") } boxplot.ExpressionSet <- function(object, col) { m <- exprs(object) d <- data.frame( x = as.vector(m), y = colnames(m)[col(m)], i = pData(object)[, col][col(m)] ) p <- ggplot(d, aes(x, y, fill = i)) + geom_boxplot(coef = Inf) + labs(x = "Intensity", y = "Array", fill = NULL) + theme_bw() } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(ggplot2) library(oligo) x <- readRDS(input$rds) p <- boxplot(x, col = params$col) ggsave(output$pdf, plot = p) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | makeContrasts <- function(object, config) { data <- pData(object) condition <- factor(data$condition) groups <- levels(condition) contrasts <- sapply(config$contrasts, paste, collapse = "-") contrasts <- limma::makeContrasts(contrasts = contrasts, levels = groups) } main <- function(input, output, params, log, config) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) library(oligo) object <- readRDS(input$rds) contrasts <- makeContrasts(object, config) saveRDS(contrasts, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) object <- readRDS(input$rds) condition <- factor(object$condition) batch <- factor(object$batch) design <- model.matrix(~ 0 + condition) if (nlevels(batch) > 1) { corrected <- removeBatchEffect(x = exprs(object), batch = batch, design = design) exprs(object) <- corrected } saveRDS(object, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | plotDensities <- function(object, ...) { UseMethod("plotDensities") } plotDensities.ExpressionSet <- function(object, col) { m <- exprs(object) d <- data.frame( x = as.vector(m), i = colnames(m)[col(m)], j = pData(object)[, col][col(m)] ) p <- ggplot(d, aes(x, group = i, colour = j)) + stat_density(geom = "line", position = "identity") + labs(x = "Intensity", y = "Density", colour = NULL) + theme_bw() } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(ggplot2) library(oligo) x <- readRDS(input$rds) p <- plotDensities(x, col = params$col) ggsave(output$pdf, plot = p, width = 8, height = 6, scale = 0.8) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | makeDesign <- function(object) { # Get phenotype data data <- pData(object) names <- colnames(data) # Set condition factor if ("condition" %in% names) { condition <- factor(data$condition) n.condition <- nlevels(condition) is.condition <- n.condition > 1 } else { is.condition <- FALSE } # Set batch factor if ("batch" %in% names) { batch <- factor(data$batch) n.batch <- nlevels(batch) is.batch <- n.batch > 1 } else { is.batch <- FALSE } # Set batch2 factor if ("batch2" %in% names) { batch2 <- factor(data$batch2) n.batch2 <- nlevels(batch2) is.batch2 <- n.batch2 > 1 } else { is.batch2 <- FALSE } # Construct design matrix if (is.condition & !is.batch & !is.batch2) { design <- model.matrix(~ 0 + condition) } if (is.condition & is.batch & !is.batch2) { design <- model.matrix(~ 0 + condition + batch) } if (is.condition & !is.batch & is.batch2) { design <- model.matrix(~ 0 + condition + batch2) } if (is.condition & is.batch & is.batch2) { design <- model.matrix(~ 0 + condition + batch + batch2) } # Rename condition coefficients which.condition <- seq_len(n.condition) colnames(design)[which.condition] <- levels(condition) # Return design matrix design } main <- function(input, output, params, log, config) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) library(oligo) object <- readRDS(input$rds) design <- makeDesign(object) saveRDS(design, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 | filterByType <- function(object) { # Filter control probes data <- fData(object) ctrl <- "ControlType" %in% colnames(data) if (ctrl) { keep <- data$ControlType == 0L } else { keep <- rep(TRUE, times = nrow(data)) } object <- object[keep, , drop = FALSE] } filterByUniq <- function(object) { # Filter non-unique probes data <- fData(object) all.equal <- function(x) length(unique(x)) == 1L keep <- data.frame( ENTREZID = sapply(data$ENTREZID, all.equal), SYMBOL = sapply(data$SYMBOL, all.equal), GENENAME = sapply(data$GENENAME, all.equal) ) keep <- apply(keep, 1, all) object <- object[keep, , drop = FALSE] # Unlist unique probes data <- fData(object) data <- data.frame( PROBEID = data$PROBEID, ENTREZID = unlist(data$ENTREZID), SYMBOL = unlist(data$SYMBOL), GENENAME = unlist(data$GENENAME), row.names = data$PROBEID ) fData(object) <- data object } filterByName <- function(object) { # Filter unnamed probes data <- fData(object) not.na <- function(x) !is.na(x) keep <- sapply(data$SYMBOL, not.na) object <- object[keep, , drop = FALSE] } filterByExpr <- function(object, group = NULL, exprs = NULL) { # Filter unexpressed probes y <- exprs(object) group <- as.factor(pData(object)[, group]) n <- tabulate(group) size <- min(n[n > 0L]) keep <- rowSums(y >= exprs) >= size object <- object[keep, , drop = FALSE] } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(oligo) x <- readRDS(input$rds) x <- filterByType(x) x <- filterByUniq(x) x <- filterByName(x) x <- filterByExpr(x, group = params$group, exprs = params$exprs) saveRDS(x, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) library( params$organism, character.only = TRUE, lib.loc = "resources/bioconductor/organism/lib/R/library" ) fit <- readRDS(input$rds) con <- paste(params$contrast, collapse = "-") out <- goana( de = fit, coef = con, geneid = "ENTREZID", FDR = params$FDR, species = strsplit(params$organism, ".", fixed = TRUE)[[1]][2] ) write.table( out, file = output$tsv, quote = FALSE, sep = '\t', col.names = NA ) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | pheatmap.mat <- function(x) { # Scale rows by 'variance-aware' Z-transformation M <- rowMeans(x, na.rm = TRUE) DF <- ncol(x) - 1 isNA <- is.na(x) anyNA <- any(isNA) if (anyNA) { mode(isNA) <- "integer" DF <- DF - rowSums(isNA) DF[DF == 0] <- 1 } x <- x - M V <- rowSums(x ^ 2, na.rm = TRUE) / DF x <- x / sqrt(V + 0.01) } pheatmap.color <- function(x) { # Return color vector colorRampPalette(rev(RColorBrewer::brewer.pal(n = 5, name = x)))(100) } pheatmap.breaks <- function(x) { # Return breaks vector abs <- max(abs(x)) abs <- min(abs, 5) seq(-abs, +abs, length.out = 101) } pheatmap.cluster_rows <- function(x) { # Return hclust object for rows hclust(dist(x, method = "euclidean"), method = "complete") } pheatmap.cluster_cols <- function(x) { # Return hclust object for columns hclust(dist(t(x), method = "euclidean"), method = "complete") } pheatmap.annotation_col <- function(x) { # Return column annotations data.frame( Condition = x$condition, row.names = colnames(x) ) } main <- function(input, output, params, log) { # Log function out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script function library(oligo) library(pheatmap) obj <- readRDS(input$rds) res <- read.delim(input$tsv) ## ind <- order(res$P.Value)[seq_len(params$ntop)] ids <- as.character(res$PROBEID[ind]) obj <- obj[ids, ] ## cpm <- exprs(obj) mat <- pheatmap.mat(cpm) lab <- fData(obj)$SYMBOL pheatmap( mat = mat, color = pheatmap.color("RdBu"), breaks = pheatmap.breaks(mat), cellwidth = 10, cellheight = 10, cluster_rows = pheatmap.cluster_rows(mat), cluster_cols = pheatmap.cluster_cols(cpm), annotation_col = pheatmap.annotation_col(obj), show_colnames = FALSE, labels_row = lab, filename = output$pdf ) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script function library(ggplot2) library(oligo) x <- readRDS(input$rds) m <- rowMeans(exprs(x)) df <- data.frame(mean = m) gg <- ggplot(df, aes(mean)) + geom_histogram(bins = 100) + labs(x = "Average Expression", y = "Frequency") + theme_bw() ggsave(output$pdf, plot = gg) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | heatmap <- function(object) { UseMethod("heatmap") } heatmap.ExpressionSet <- function(object) { require(RColorBrewer) expr <- exprs(object) dist <- dist(t(expr)) brew <- RColorBrewer::brewer.pal(5, "Reds") cols <- colorRampPalette(rev(brew))(255) anno <- data.frame( Condition = object$condition, row.names = colnames(object) ) pheatmap( mat = as.matrix(dist), color = cols, clustering_distance_rows = dist, clustering_distance_cols = dist, treeheight_row = 0, annotation_row = anno, show_colnames = FALSE, silent = TRUE ) } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(oligo) library(pheatmap) x <- readRDS(input$rds) p <- heatmap(x) pdf(output$pdf) grid::grid.newpage() grid::grid.draw(p$gtable) dev.off() } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(oligo) library( params$platform, character.only = TRUE, lib.loc = "resources/bioconductor/platform/lib/R/library" ) obj <- readRDS(input$rds) ids <- sampleNames(obj) itr <- seq_along(ids) png(output$png) for (i in itr) { image(obj, which = i, main = ids[i]) } dev.off() } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(oligo) library( params$platform, character.only = TRUE, lib.loc = "resources/bioconductor/platform/lib/R/library" ) dat <- read.delim(input$tsv, row.names = "sample") set <- read.celfiles(dat$celfile, phenoData = AnnotatedDataFrame(dat)) saveRDS(set, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) library( params$organism, character.only = TRUE, lib.loc = "resources/bioconductor/organism/lib/R/library" ) fit <- readRDS(input$rds) con <- paste(params$contrast, collapse = "-") out <- kegga( de = fit, coef = con, geneid = "ENTREZID", FDR = params$FDR, species = strsplit(params$organism, ".", fixed = TRUE)[[1]][2] ) write.table( out, file = output$tsv, quote = FALSE, sep = '\t', col.names = NA ) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 | lmFit <- function(object, design) { # Get phenotype data data <- pData(object) names <- colnames(data) # Set block factor if ("block" %in% names) { block <- factor(data$block) n.block <- nlevels(block) is.block <- n.block > 1 } else { is.block <- FALSE } # Fit linear model ... if (is.block) { # ... with block output <- limma::duplicateCorrelation(object, design, block = block) object <- limma::lmFit( object = object, design = design, block = block, correlation = output$consensus.correlation ) } else { # ... without block object <- limma::lmFit( object = object, design = design ) } object } main <- function(input, output, params, log, config) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) library(oligo) object <- readRDS(input$rds[1]) design <- readRDS(input$rds[2]) contrasts <- readRDS(input$rds[3]) object <- lmFit(object, design) object <- contrasts.fit(object, contrasts) object <- eBayes(object) saveRDS(object, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log, snakemake@config) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | plotMA <- function(object, ...) { UseMethod("plotMA") } plotMA.ExpressionSet <- function(object, col) { mat <- exprs(object) med <- apply(mat, 1, median, na.rm = TRUE) M <- mat - med A <- (mat + med) / 2 df <- data.frame( M = as.vector(M), A = as.vector(A), I = colnames(mat)[col(mat)], group = pData(object)[, col][col(mat)] ) g <- ggplot(df, aes(A, M, colour = group)) + geom_point(size = 0.05) + facet_wrap_paginate(~ I, ncol = 4, nrow = 4) g } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(ggforce) library(oligo) obj <- readRDS(input$rds) plt <- plotMA(obj, col = params$col) num <- n_pages(plt) itr <- seq_len(num) pdf(output$pdf) for (i in itr) { print(plt + facet_wrap_paginate(~ I, ncol = 4, nrow = 4, page = i)) } dev.off() } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 | plotMDS <- function(object, ...) { UseMethod("plotMDS") } plotMDS.ExpressionSet <- function(object, col) { mat <- exprs(object) var <- matrixStats::rowVars(mat) num <- min(500, length(var)) ind <- order(var, decreasing = TRUE)[seq_len(num)] dst <- dist(t(mat[ind, ])) mds <- cmdscale(as.matrix(dst)) dat <- data.frame( MD1 = mds[, 1], MD2 = mds[, 2], group = pData(object)[, col] ) ggplot(dat, aes(MD1, MD2, colour = group)) + geom_point(size = 3) + labs(x = "MDS 1", y = "MDS 2", colour = "Group") } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(ggplot2) library(oligo) obj <- readRDS(input$rds) plt <- plotMDS(obj, col = params$col) ggsave(output$pdf, plot = plt) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | plotSA <- function(x) { UseMethod("plotSA") } plotSA.ExpressionSet <- function(x) { require("hexbin") E <- exprs(x) mu <- matrixStats::rowMeans2(E) sd <- matrixStats::rowSds(E) df <- data.frame(mean = mu, rank = rank(mu), sd = sd) gg <- ggplot(df, aes(rank, sd)) + geom_hex(bins = 100) + labs(x = "Mean", y = "SD") } main <- function(input, output, params, log) { # Log function out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script function library(ggplot2) library(oligo) x <- readRDS(input$rds) p <- plotSA(x) ggsave(output$pdf, plot = p) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(oligo) library( params$platform, character.only = TRUE, lib.loc = "resources/bioconductor/platform/lib/R/library" ) obj <- readRDS(input$rds) obj <- rma(obj) saveRDS(obj, file = output$rds) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | plotPCA <- function(x, ...) { UseMethod("plotPCA") } plotPCA.ExpressionSet <- function(x, col) { mat <- exprs(x) var <- matrixStats::rowVars(mat) num <- min(500, length(var)) ind <- order(var, decreasing = TRUE)[seq_len(num)] pca <- prcomp(t(mat[ind, ])) pct <- (pca$sdev ^ 2) / sum(pca$sdev ^ 2) dat <- data.frame(PC1 = pca$x[,1], PC2 = pca$x[,2], group = pData(x)[, col]) ggplot(dat, aes_string(x = "PC1", y = "PC2", color = "group")) + geom_point(size = 3) + xlab(paste0("PC1: ", round(pct[1] * 100), "% variance")) + ylab(paste0("PC2: ", round(pct[2] * 100), "% variance")) + coord_fixed() } main <- function(input, output, params, log) { # Log function out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script function library(ggplot2) library(oligo) x <- readRDS(input$rds) p <- plotPCA(x, col = params$col) ggsave(output$pdf, plot = p) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(ggplot2) res <- read.delim(input$tsv) plt <- ggplot(res, aes(x = P.Value)) + geom_histogram(binwidth = 0.05, colour = "black", fill = "gainsboro", boundary = 0) + labs(x = expression(italic(p)*"-values"), y = "Frequency") + theme_bw() ggsave(output$pdf, plot = plt) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 | plotRLE <- function(object, ...) { UseMethod("plotRLE") } plotRLE.ExpressionSet <- function(object, col) { mat <- exprs(object) med <- apply(mat, 1, median, na.rm = TRUE) M <- mat - med d <- data.frame( x = as.vector(M), y = colnames(M)[col(M)], i = pData(object)[, col][col(M)] ) l <- quantile(d$x, c(0.01, 0.99)) p <- ggplot(d, aes(x, y, fill = i)) + geom_boxplot(coef = Inf) + scale_x_continuous(limits = l) + labs(x = "RLE", y = "Array", fill = NULL) + theme_bw() } main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(ggplot2) library(oligo) x <- readRDS(input$rds) p <- plotRLE(x, col = params$col) ggsave(output$pdf, plot = p) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 | which.pmin <- function(...) unname(apply(cbind(...), 1, which.min)) main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Catch fs <- file.size(input$tsv) if (fs == 1) { file.create(output$pdf) return(NULL) } # Script library(ggplot2) library(limma) x <- read.delim(input$tsv, row.names = 1) x <- subset(x, !is.na(Term)) # NA in Term column triggers error in topGO x <- split(x, x$Ont) x <- lapply(x, topGO, number = params$number, truncate.term = 50L) x <- do.call(rbind, x) x$P <- pmin(x$P.Up, x$P.Down) i <- which.pmin(x$P.Up, x$P.Down) x$Direction <- c("Up", "Down")[i] p <- ggplot(x, aes(-log10(P), reorder(Term, -P), colour = Direction)) + geom_point() + geom_segment(aes(x = 0, xend = -log10(P), y = reorder(Term, -P), yend = reorder(Term, -P))) + labs(x = expression("-Log"[10]*"("*italic(p)*"-value)"), y = "Term") + facet_wrap(~ Ont, ncol = 1, scales = "free") + theme_bw() ggsave(output$pdf, plot = p) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | which.pmin <- function(...) unname(apply(cbind(...), 1, which.min)) main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Catch fs <- file.size(input$tsv) if (fs == 1) { file.create(output$pdf) return(NULL) } # Script library(ggplot2) library(limma) x <- read.delim(input$tsv, row.names = 1) x <- subset(x, !is.na(Pathway)) # NA in Pathway column triggers error in topKEGG x <- topKEGG(x, number = params$number, truncate.path = 50L) x$P <- pmin(x$P.Up, x$P.Down) i <- which.pmin(x$P.Up, x$P.Down) x$Direction <- c("Up", "Down")[i] p <- ggplot(x, aes(-log10(P), reorder(Pathway, -P), colour = Direction)) + geom_point() + geom_segment(aes(x = 0, xend = -log10(P), y = reorder(Pathway, -P), yend = reorder(Pathway, -P))) + labs(x = expression("-Log"[10]*"("*italic(p)*"-value)"), y = "Pathway") + theme_bw() h <- params$number * 0.25 ggsave(output$pdf, plot = p, width = 8, height = h, scale = 0.8) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | main <- function(input, output, params, log) { # Log out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script library(limma) fit <- readRDS(input$rds) con <- paste(params[["contrast"]], collapse = "-") res <- topTable(fit, coef = con, number = Inf, sort.by = "none") write.table(res, file = output$tsv, quote = FALSE, sep = '\t', row.names = FALSE) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 | add.status <- function(x, PAdj = 0.05) { # Determine the status of each gene in the results table x$status <- factor("NS", levels = c("Up", "NS", "Down")) x$status[x$logFC > 0 & x$adj.P.Val < PAdj] <- "Up" x$status[x$logFC < 0 & x$adj.P.Val < PAdj] <- "Down" return(x) } add.symbol <- function(x, n = 50) { # Show symbol for the top N genes in the results table i <- sort(x$adj.P.Val, decreasing = FALSE, index.return = TRUE)$ix[seq_len(n)] x$symbol <- "" x$symbol[i] <- x$SYMBOL[i] return(x) } main <- function(input, output, params, log) { # Log function out <- file(log$out, open = "wt") err <- file(log$err, open = "wt") sink(out, type = "output") sink(err, type = "message") # Script function library(ggplot2) library(ggrepel) library(scales) res <- read.delim(input$tsv) res <- add.status(res, PAdj = params$PAdj) res <- add.symbol(res, n = params$n) res <- res[order(res$adj.P.Val, decreasing = TRUE), ] col <- c("Up" = "#d8b365", "NS" = "#f5f5f5", "Down" = "#5ab4ac") lab <- c( "Up" = sprintf("Up (%s)", comma(sum(res$status == "Up"))), "NS" = sprintf("NS (%s)", comma(sum(res$status == "NS"))), "Down" = sprintf("Down (%s)", comma(sum(res$status == "Down"))) ) plt <- ggplot(res, aes(logFC, -log10(P.Value), colour = status, label = symbol)) + geom_point() + scale_colour_manual(values = col, labels = lab) + geom_text_repel(size = 1.88, colour = "#000000", show.legend = FALSE, max.overlaps = Inf) + labs( x = expression("log"[2]*" fold change"), y = expression("-log"[10]*"("*italic(P)*")"), colour = "Status" ) + theme_bw() + theme( aspect.ratio = 1, axis.title.x = element_text(margin = unit(c(1, 0, 0, 0), "lines")), axis.title.y = element_text(margin = unit(c(0, 1, 0, 0), "lines")) ) ggsave(output$pdf, plot = plt) } main(snakemake@input, snakemake@output, snakemake@params, snakemake@log) |
Support
- Future updates
Related Workflows





