Workflow Steps and Code Snippets
191 tagged steps and code snippets that match keyword DESeq2
Snakemake workflow for generating gene expression counts from RNA-sequencing data.
16 17 | script: "../scripts/deseq2.R" |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") library("DESeq2") counts <- read.csv(snakemake@input[["counts"]], sep="\t", row.names="Geneid") counts <- subset(counts, select = -c(Chr, Start, End, Strand, Length)) samples <- read.csv(snakemake@params[["samples"]], sep="\t", stringsAsFactors = TRUE) units <- read.csv(snakemake@params[["units"]], sep="\t", stringsAsFactors = TRUE) coldata <- merge(unique(samples), units, by='sample') coldata <- subset(coldata, select=-c(fq1, fq2)) coldata[] <- lapply(coldata, as.factor) rownames(coldata) <- paste(coldata$sample, paste("t",coldata$time,sep=""), coldata$replicate, sep=".") # Speciffic steps needed for our analysis #-------------------------------------------------- counts$G_M.t0.1 <- counts$G.t0.1 counts$G_M.t0.2 <- counts$G.t0.2 zero_rows <- data.frame(rbind( c("G_M", "mitomycin", 1, 0), c("G_M", "mitomycin", 2, 0) )) colnames(zero_rows) <- colnames(coldata) rownames(zero_rows) <- c("G_M.t0.1", "G_M.t0.2") coldata <- rbind(coldata, zero_rows) #-------------------------------------------------- # Filter rows with less than 5 counts in at least 2 samples filter <- apply(counts, 1, function(x) length(x[x>5])>=2) filtered <- counts[filter,] # Create a DESeq data set with a full desing. ddsFull <- DESeqDataSetFromMatrix( countData = filtered, colData = coldata, design = as.formula(snakemake@params[["model"]])) # Use Likelihood-ratio test on a reduced model. dds <- DESeq(ddsFull, test="LRT", reduced = as.formula(snakemake@params[["reduced_model"]])) saveRDS(dds, file = snakemake@output[["rds"]]) alpha <- as.numeric(snakemake@params[["alpha"]]) result <- results(dds, alpha=alpha) # Construct a list of shrunken LFC for the coefficients of the model. logfc.list <- list() logfc.list[[1]] <- data.frame(result) for (i in 2:length(resultsNames(dds))) { coeff = resultsNames(dds)[i] res.t <- results(dds, name=coeff, test="Wald", alpha = alpha) out.t <- as.data.frame(res.t) out.t <- cbind(geneid = rownames(out.t), out.t) write.table( out.t, file=paste("deseq2/", coeff, ".tsv", sep=""), sep="\t", quote=FALSE, row.names=FALSE ) res_shrunken <- lfcShrink(dds, coef=coeff, type="apeglm", res = res.t) out.table <- as.data.frame(res_shrunken) out.table <- cbind(geneid = rownames(out.table), out.table) write.table( out.table, file=paste("deseq2/", coeff, "_shrunken.tsv", sep=""), sep="\t", quote=FALSE, row.names=FALSE ) svg(paste("deseq2/", coeff, "ma_plot.svg", sep="")) plotMA(result, ylim=c(-2,2)) dev.off() # Add the shrunken log2FC to the list. lfc <- res_shrunken[, "log2FoldChange", drop=FALSE] colnames(lfc) <- c(paste(coeff, "sLFC", sep=".")) logfc.list[[i]] <- lfc } # Write combined results in the main output. out.result <- do.call(cbind, logfc.list) out.result <- out.result[order(out.result$padj),] out.result <- cbind(geneid = rownames(out.result), out.result) write.table( out.result, file=snakemake@output[["table"]], sep="\t", quote=FALSE, row.names=FALSE ) # Use variance stabilising transformation and then plot PCA. vsd <- vst(ddsFull, blind=FALSE) svg(snakemake@output[["pca_plot"]]) plotPCA(vsd, intgroup=c("condition", "time"), ntop = 500) dev.off() |
76 77 | script: "../scripts/deseq2.R" |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") library("DESeq2") parallel <- FALSE if (snakemake@threads > 1) { library("BiocParallel") # setup parallelization register(MulticoreParam(snakemake@threads)) parallel <- TRUE } # colData and countData must have the same sample order, but this is ensured # by the way we create the count matrix cts <- read.table(snakemake@input[["counts"]], header=TRUE, row.names="gene", check.names=FALSE) coldata <- read.table(snakemake@params[["samples"]], header=TRUE, row.names="sample", check.names=FALSE) dds <- DESeqDataSetFromMatrix(countData=cts, colData=coldata, design=~ condition) # remove uninformative columns dds <- dds[ rowSums(counts(dds)) > 1, ] # normalization and preprocessing dds <- DESeq(dds, parallel=parallel) saveRDS(dds, file=snakemake@output[[1]]) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") library("DESeq2") parallel <- FALSE if (snakemake@threads > 1) { library("BiocParallel") # setup parallelization register(MulticoreParam(snakemake@threads)) parallel <- TRUE } dds <- readRDS(snakemake@input[[1]]) contrast <- c("condition", snakemake@params[["contrast"]]) res <- results(dds, contrast=contrast, parallel=parallel) # shrink fold changes for lowly expressed genes res <- lfcShrink(dds, contrast=contrast, res=res) # sort by p-value res <- res[order(res$padj),] # TODO explore IHW usage # store results svg(snakemake@output[["ma_plot"]]) plotMA(res, ylim=c(-2,2)) dev.off() write.table(as.data.frame(res), file=snakemake@output[["table"]]) |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 | log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") library("DESeq2") # load deseq2 data dds <- readRDS(snakemake@input[[1]]) # obtain normalized counts counts <- rlog(dds, blind=FALSE) svg(snakemake@output[[1]]) plotPCA(counts, intgroup=snakemake@params[["pca_labels"]]) dev.off() |
Snakemake workflow for comparison of differential abundance ranks (v0.3.0a2)
19 20 | script: "../scripts/R/deseq2.R" |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | library(biomformat) library(DESeq2) # Set logging information log <- file(snakemake@log[[1]], open="wt") sink(log) sink(log, type="message") # Load the input table print("Loading table...") table <- biomformat::read_biom(snakemake@input[["table"]]) table <- as.matrix(biomformat::biom_data(table)) # Load the metadata print("Loading metadata...") metadata <- read.table(snakemake@input[["metadata"]], sep="\t", header=T, row.names=1) covariate <- snakemake@params[[1]][["factor_name"]] target <- snakemake@params[[1]][["target_level"]] reference <- snakemake@params[[1]][["reference_level"]] confounders <- snakemake@params[[1]][["confounders"]] # Harmonize table and metadata samples print("Harmonizing table and metadata samples...") samples <- colnames(table) metadata <- subset(metadata, rownames(metadata) %in% samples) metadata[[covariate]] <- as.factor(metadata[[covariate]]) metadata[[covariate]] <- relevel(metadata[[covariate]], reference) sample_order <- row.names(metadata) table <- table[, sample_order] row.names(table) <- paste0("F_", row.names(table)) # Append F_ to features to avoid R renaming # Create the design formula print("Creating design formula...") design.formula <- paste0("~", covariate) if (length(confounders) != 0) { confounders_form = paste(confounders, collapse=" + ") design.formula <- paste0(design.formula, " + ", confounders_form) } design.formula <- as.formula(design.formula) # Run DESeq2 print("Running DESeq2...") dds <- DESeq2::DESeqDataSetFromMatrix( countData=table, colData=metadata, design=design.formula ) dds.results <- DESeq2::DESeq(dds, sfType="poscounts") saveRDS(dds.results, snakemake@output[[2]]) print("Saved RDS!") # Create results table and modify result names results <- DESeq2::results( dds.results, format="DataFrame", tidy=TRUE, cooksCutoff=FALSE, pAdjustMethod="BH", contrast=c(covariate, target, reference) ) row.names(results) <- gsub("^F_", "", row.names(table)) # Save results to output file write.table(results, file=snakemake@output[[1]], sep="\t") print("Saved differentials!") |
tool / biotools
DESeq2
R/Bioconductor package for differential gene expression analysis based on the negative binomial distribution. Estimate variance-mean dependence in count data from high-throughput sequencing assays and test for differential expression based on a model using the negative binomial distribution.