This pipeline contains the data processing steps used for Thuy Ngo's 2019 cell-free RNA manuscript

public 1yr ago 0 bookmarks

View Workflow

cfrna-seq-pipeline-ngo-manuscript-2019 — View Workflow

Help improve this workflow!

This workflow has been published but could be further improved with some additional meta data:

Keyword(s) in categories input, output, operation, topic

You can help improve this workflow by suggesting the addition or removal of keywords, suggest changes and report issues, or request to become a maintainer of the Workflow .

cfRNA-seq Pipeline

Pipeline to process raw fastq files for Thuy Ngo's manuscript: "Plasma cell-free RNA profiling enables pan-cancer detection and distinguishes cancer from pre-malignant conditions"

This is a package of Python and R scri

Code Snippets

run:
    sickle = config["sickle_tool"]

    shell("{sickle} pe -f {input.fwd} -r {input.rev}  -l 40 -q 20 -t sanger  -o {output.fwd} -p {output.rev} -s {output.single} &> {input.fwd}.log")

SnakeMake sickle From line 9 of rules/align_rmdp.smk

shell:
    """fastqc --outdir samples/fastqc/{wildcards.sample} --extract  -f fastq {input.fwd} {input.rev}"""

SnakeMake FastQC From line 23 of rules/align_rmdp.smk

shell:
    """fastq_screen --aligner bowtie2 --conf {params.conf} --outdir samples/fastqscreen/{wildcards.sample} {input.fwd} {input.rev}"""

SnakeMake Bowtie 2 From line 41 of rules/align_rmdp.smk

run:
     STAR=config["star_tool"],
     pathToGenomeIndex = config["star_index"]

     shell("""
            {STAR} --runThreadN {threads} --runMode alignReads --genomeDir {pathToGenomeIndex} \
            --readFilesIn {input.fwd} {input.rev} \
            --outFileNamePrefix samples/star/{wildcards.sample}_bam/ \
            --sjdbGTFfile {params.gtf} --quantMode GeneCounts \
            --sjdbGTFtagExonParentGene gene_name \
            --outSAMtype BAM SortedByCoordinate \
            #--readFilesCommand zcat \
            --twopassMode Basic
            """)

SnakeMake From line 54 of rules/align_rmdp.smk

shell:
    """samtools index {input} {output}"""

SnakeMake SAMtools From line 76 of rules/align_rmdp.smk

script:
    "../scripts/compile_star_log.py"

SnakeMake From line 84 of rules/align_rmdp.smk

run:
    picard_insert_size=config["insert_size_tool"]

    shell("java -jar {picard_insert_size} \
           I={input} \
           O={output.metrics} \
           H={output.hist}")

SnakeMake From line 93 of rules/align_rmdp.smk

run:
  picard=config["picard_tool"]

  shell("java -Xmx3g -jar {picard} \
  INPUT={input} \
  OUTPUT={output} \
  METRICS_FILE=samples/genecounts_rmdp/{wildcards.sample}_bam/{wildcards.sample}.rmd.metrics.text \
  REMOVE_DUPLICATES=true")

SnakeMake From line 109 of rules/align_rmdp.smk

shell:
  """samtools sort -O bam -n {input} -o {output}"""

SnakeMake SAMtools From line 129 of rules/align_rmdp.smk

wrapper:
    "0.17.0/bio/samtools/stats"

SnakeMake From line 142 of rules/align_rmdp.smk

shell:
    """htseq-count \
            -f bam \
            -r name \
            -s reverse \
            -m intersection-strict \
            {input} \
            {params.gtf} > {output}"""

SnakeMake htseqcount From line 159 of rules/align_rmdp.smk

shell:
    """htseq-count \
           -f bam \
           -r name \
           -s reverse \
           -m intersection-strict \
           -i exon_id \
           --additional-attr=gene_name \
           {input} \
           {params.exon_gtf} > {output}"""

SnakeMake htseqcount From line 178 of rules/align_rmdp.smk

script:
    "../scripts/compile_counts_table.py"

SnakeMake From line 195 of rules/align_rmdp.smk

script:
    "../scripts/compile_counts_table_w_stats.py"

SnakeMake From line 204 of rules/align_rmdp.smk

script:
    "../scripts/compile_exon_counts.R"

SnakeMake From line 213 of rules/align_rmdp.smk

script:
    "../scripts/deseq2-init.R"

SnakeMake DESeq2 From line 17 of rules/deseq.smk

script:
    "../scripts/deseq2_pairwise.R"

SnakeMake From line 42 of rules/deseq.smk

script:
    "../scripts/deseq2_group.R"

SnakeMake From line 66 of rules/deseq.smk

script:
    "../scripts/QC.R"

SnakeMake From line 89 of rules/deseq.smk

script:
    "../scripts/qplot.R"

SnakeMake From line 103 of rules/deseq.smk

script:
    "../scripts/density_plot.R"

SnakeMake From line 119 of rules/deseq.smk

script:
    "../scripts/runGOforDESeq2.R"

SnakeMake From line 137 of rules/deseq.smk

script:
    "../scripts/RNAseq_makeVolcano.R"

SnakeMake From line 152 of rules/deseq.smk

script:
    "../scripts/permutation_test.R"

SnakeMake From line 170 of rules/deseq.smk

script:
    "../scripts/run_glimma.R"

SnakeMake From line 185 of rules/deseq.smk

script:
    "../scripts/run_glimma_mds.R"

SnakeMake From line 198 of rules/deseq.smk

shell:
    "insertion_profile.py -s '{params.seq_layout}' -i {input} -o rseqc/insertion_profile/{wildcards.sample}/{wildcards.sample}"

SnakeMake From line 13 of rules/omic_qc.smk

shell:
    "inner_distance.py -i {input} -o rseqc/inner_distance/{wildcards.sample}/{wildcards.sample} -r {params.bed}"

SnakeMake From line 28 of rules/omic_qc.smk

shell:
    "clipping_profile.py -i {input} -s '{params.seq_layout}' -o rseqc/clipping_profile/{wildcards.sample}/{wildcards.sample}"

SnakeMake From line 44 of rules/omic_qc.smk

shell:
   "read_distribution.py -i {input} -r {params.bed} > {output}"

SnakeMake From line 57 of rules/omic_qc.smk

script:
    "../scripts/get_rd.py"

SnakeMake From line 66 of rules/omic_qc.smk

shell:
    "read_GC.py -i {input} -o rseqc/read_GC/{wildcards.sample}/{wildcards.sample}"

SnakeMake From line 79 of rules/omic_qc.smk

import pandas as pd


"""Accepts a HTSeq output directory and compiles {sample}_htseq_gene_count.txt into a joined tab sep file
Args:
    snakemake.input (list): list of globbed wildcards HTSeq 
    snakemake.output[0] (str): data/{params.project_id}_counts.txt

Returns:
    Compiled STAR gene counts table as tab delimited file.
"""

tables = [pd.read_csv(fh, sep='\t', index_col=0, names=[fh.split('/')[-1].split('_')[0]]) for fh in snakemake.input]
joined_table = pd.concat(tables, axis=1)
filtered_joined = joined_table.iloc[:-5, :]
filtered_joined_sorted = filtered_joined.reindex(sorted(filtered_joined.columns), axis = 1)
filtered_joined_sorted.to_csv(snakemake.output[0], sep='\t')

Python Pandas STAR HTSeq From line 1 of scripts/compile_counts_table.py

import pandas as pd


"""Accepts a HTSeq output directory and compiles {sample}_htseq_gene_count.txt into a joined tab sep file
Args:
    snakemake.input (list): list of globbed wildcards HTSeq 
    snakemake.output[0] (str): data/{params.project_id}_counts.txt
Returns:
    Compiled STAR gene counts table as tab delimited file.
"""

tables = [pd.read_csv(fh, sep='\t', index_col=0, names=[fh.split('/')[-1].split('_')[0]]) for fh in snakemake.input]
joined_table = pd.concat(tables, axis=1)
joined_sorted = joined_table.reindex(sorted(joined_table.columns), axis = 1)
joined_sorted.to_csv(snakemake.output[0], sep='\t')

Python Pandas STAR HTSeq From line 1 of scripts/compile_counts_table_w_stats.py

files = snakemake@input

#make a empty dataframe, ncol= number of samples +1, nrow= number of rows in gene count file
all = data.frame(matrix(ncol = length(files) + 2, nrow=nrow(read.table(files[[1]], header = F, sep = "\t"))))

#get row names
samp1=read.table(file = files[[1]], header = F, sep = "\t")
genelist=samp1[,1:2]
genelist=as.data.frame(genelist)
colnames(genelist)=c("ensembl_gene_id","gene_id")
head(genelist)

#combine counts from files
for(i in 1:length(files)){
  counts = read.table(file = files[[i]], header = F, stringsAsFactors  = F, sep = "\t")
  print(files[[i]])
  samp=gsub("_htseq_exon_count.txt","",files[[i]])

  colnames(counts)=c("ensembl_gene_id","gene_id",samp)

  if(i==1){
    all=merge(genelist,counts)  
  } 
  else
  {all=merge(all,counts) }

}

write.table(all, file = snakemake@output[[1]], quote = F, sep = "\t", row.names = F)

R From line 1 of scripts/compile_exon_counts.R

import pandas as pd


"""Function accepts a STAR output directory and compiles all sample information from Log.final.out
Args:
    snakemake.input (list): list of globbed wildcards STAR Log.final.out
    project_title (str): Project title for compiled STAR mapping statistics

Returns:
    Compiled STAR log.final.out as tab delimited file.
"""

tables = [pd.read_csv(fh, sep = '\t', index_col = 0, names = [fh.split('/')[-2]]) for fh in snakemake.input]
joined_table = pd.concat(tables, axis=1)
joined_table_sorted = joined_table.reindex(sorted(joined_table.columns), axis = 1)
joined_table_sorted.to_csv(snakemake.output[0], sep='\t')

Python Pandas STAR From line 1 of scripts/compile_star_log.py

library(DESeq2)
library(ggplot2)
library(reshape2)
library(data.table)
library(plyr)
library(RColorBrewer)


rld <- snakemake@input[['rld']]
cat(sprintf(c('rld: ', rld, '\n')))


condition <- snakemake@params[['linear_model']]
cat(sprintf(c('condition: ', condition, '\n')))

project_id <- snakemake@params[['project_id']]

density_plot <- snakemake@output[['density']]
cat(sprintf(c('Density plot : ', density_plot, '\n')))


colors <- snakemake@params['colors']
discrete <- snakemake@params['discrete']

# function to grab the ggplot2 colours
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

rld = readRDS(rld)
normed_values = assay(rld)
normed_t = t(normed_values)
meta = colData(rld)

if(colors[[1]] !='NA' & discrete[[1]] =='NA'){
    if(brewer.pal.info[colors[[1]],]$maxcolors >= length(levels(meta[[condition]]))) {
        pal <- brewer.pal(length(levels(meta[[condition]])),name=colors[[1]])
    } 
} else if(discrete[[1]] != 'NA' & length(discrete)==length(levels(meta[[condition]]))){
        pal <- unlist(discrete)
} else {
        pal <- gg_color_hue(length(levels(meta[[condition]])))
}

joined_counts = cbind(meta[condition],normed_t)

x = as.data.table(joined_counts)
mm <- melt(x,id=condition)

mu <- ddply(mm, condition, summarise, grp.mean=mean(value))
pdf(density_plot)
p<-ggplot(mm, aes_string(x='value', color=condition)) +
  geom_density()+
  geom_vline(data=mu, aes_string(xintercept='grp.mean', color=condition),
             linetype="dashed") + xlab('regularized log expression') + 
  scale_color_manual(values=pal) +
  ggtitle(eval(project_id)) + theme(plot.title = element_text(hjust = 0.5))
p
dev.off()

R ggplot2 data.table RColorBrewer reshape2 plyr From line 1 of scripts/density_plot.R

library("DESeq2")
library("ggplot2")
library("pheatmap")
library("dplyr")
library("vsn")
library("RColorBrewer")
library("genefilter")

cat(sprintf(c('Working directory',getwd())))

cat(sprintf('Setting parameters'))

pca_plot <- snakemake@output[['pca']]
cat(sprintf(c('PCA plot: ',pca_plot)))

labels <- snakemake@params[['pca_labels']]
cat(sprintf(c('PCA Labels: ',labels)))

sd_mean_plot <- snakemake@output[['sd_mean_plot']]
cat(sprintf(c('SD Mean plot: ',sd_mean_plot,'\n')))

distance_plot <- snakemake@output[['distance_plot']]
cat(sprintf(c('Distance plot: ',distance_plot,'\n')))

heatmap_plot <- snakemake@output[['heatmap_plot']]
cat(sprintf(c('Heatmap Plot: ', heatmap_plot, '\n')))

rds_out <- snakemake@output[['rds']]
cat(sprintf(c('RDS Output: ', rds_out, '\n')))

rld_out <- snakemake@output[['rld_out']]
cat(sprintf(c('RLD Output: ', rld_out, '\n')))

counts <- snakemake@input[['counts']]
cat(sprintf(c('Counts table: ', counts, '\n')))

metadata <- snakemake@params[['samples']]
cat(sprintf(c('Metadata: ', metadata, '\n')))

sampleID <- snakemake@params[['sample_id']]
cat(sprintf(c('Sample ID: ', sampleID, '\n')))

Type <- snakemake@params[['linear_model']]
cat(sprintf(c('Linear Model: ', Type, '\n')))

group <- snakemake@params[['LRT']]
cat(sprintf(c('Subsetted group: ', group, '\n')))

plot_cols <- snakemake@config[['meta_columns_to_plot']]
subset_cols = names(plot_cols)

# color palette
colors <- snakemake@params[['colors']]
discrete <- snakemake@params[['discrete']]

# function to grab the ggplot2 colours
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

Dir <- "results/diffexp/group/"

md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
md <- md[order(md[sampleID]),]

# Read in counts table
cts <- read.table(counts, header=TRUE, row.names=1, sep="\t", check.names=F)
cts <- cts[,order(colnames(cts))]

# Put sample IDs as rownames of metadata
rownames(md) <- md[[sampleID]]
md[[sampleID]] <- NULL

# Ensure that we subset md to have exactly the same samples as in the counts table
md <- md[colnames(cts),]
dim(md)

# Check
stopifnot(rownames(md)==colnames(cts))

# Define colours based on number of Conditions
if(colors[[1]] !='NA' & discrete[[1]] =='NA'){
    if (brewer.pal.info[colors[[1]],]$maxcolors >= length(unique(md[[Type]]))) {
        pal <- brewer.pal(length(unique(md[[Type]])),name=colors[[1]])
    } 
} else if(discrete[[1]] != 'NA' & length(discrete)==length(unique(md[[Type]]))){
        pal <- unlist(discrete)
} else {
        pal <- gg_color_hue(length(unique(md[[Type]])))
}

# Create dds object from counts data and correct columns
dds <- DESeqDataSetFromMatrix(countData=cts,
                              colData=md,
                              design= as.formula(paste('~',Type)))

# Remove uninformative columns
dds <- dds[ rowSums(counts(dds)) >= 1, ]

# Likelihood Ratio test to look at differential expression across ALL types, and not just pairs of types (contrast)
dds.lrt <- DESeq(dds, test="LRT", reduced=~1)
res.lrt <- results(dds.lrt, cooksCutoff = Inf, independentFiltering=FALSE)
head(res.lrt)

# Obtain normalized counts
rld <- rlog(dds.lrt, blind=FALSE)

# Pairwise PCA Plot
pcaData <- plotPCA(rld, intgroup=labels[[1]], returnData=TRUE)

pdf(pca_plot)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]])) +
  geom_point(size=3) +
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) +
  coord_fixed() +
  scale_colour_manual(values=pal)
dev.off()

# Pairwise PCA Plot with more than one PCA parameter
if (length(labels)>1) {
  pca_plot2 <- sub("$","twoDimensional_pca_plot.pdf", Dir)
  pcaData <- plotPCA(rld, intgroup=c(labels[[1]], labels[[2]]), returnData=TRUE)
  pdf(pca_plot2, 5, 5)
  percentVar <- round(100 * attr(pcaData, "percentVar"))
  ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]], shape=labels[[2]])) +
    geom_point(size=3) +
    xlab(paste0("PC1: ",percentVar[1],"% variance")) +
    ylab(paste0("PC2: ",percentVar[2],"% variance")) +
    coord_fixed() +
    scale_colour_manual(values=pal)
  dev.off()
}

# SD mean plot
pdf(sd_mean_plot)
meanSdPlot(assay(rld))
dev.off()

# Heatmap of distances
pdf(distance_plot)
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)

rownames(sampleDistMatrix) <- colnames(rld)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix, fontsize=5, scale="row",
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)
dev.off()

# Heatmap across all samples
# List top 50 genes for group comparisons
topGenes <- head(order(res.lrt$padj), 50)

# Extract topGenes from rld object
plot <- assay(rld)[topGenes,] #for 2+ types

# Generate data frame with samples as the rownames and single colData as the first row
# Default when we subset creates an incompatible dataframe so this is a check
df <- as.data.frame(colData(rld))
if (length(subset_cols)==1) {
  annot <- as.data.frame(cbind(rownames(df), paste(df[[subset_cols[1]]])))
  names(annot) <- c("SampleID", subset_cols[1])
  rownames(annot) <- annot[[sampleID]]
  annot[[sampleID]] <- NULL
} else {
  annot <- df[,subset_cols]
}

filt <- plot[apply(plot, MARGIN = 1, FUN = function(x) sd(x) != 0),]

pdf(heatmap_plot)
pheatmap(filt, cluster_rows=T, scale="row", fontsize=6,fontsize_row=6,fontsize_col=6,show_rownames=T, cluster_cols=T, annotation_col=annot, labels_col=as.character(rownames(df)), main = paste("Heatmap of top 50 DE genes across all samples"))
dev.off()

saveRDS(dds, file=rds_out)
saveRDS(rld, file=rld_out)

group <- as.vector(group)

# If LRT group has been specified, run the analysis for that group
if (length(group)>0) {
  md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
  md <- md[order(md[sampleID]),]
  cts <- read.table(counts, header=TRUE, row.names=1, sep="\t")
  cts <- cts[,order(colnames(cts))]
  md <- md[md[[Type]] %in% group,]
  rownames(md) <- md[[sampleID]]
  md[[sampleID]] <- NULL
  keep <- colnames(cts)[colnames(cts) %in% rownames(md)]
  cts <- cts[, keep]
  dim(cts)
  md <- md[colnames(cts),]
  dim(md)

  dds <- DESeqDataSetFromMatrix(countData=cts,
                              colData=md,
                              design= as.formula(paste('~',Type)))
  dds <- dds[ rowSums(counts(dds)) >= 1, ]
  dds.lrt <- DESeq(dds, test="LRT", reduced=~1)
  res.lrt <- results(dds.lrt, cooksCutoff = Inf, independentFiltering=FALSE)
  rld <- rlog(dds.lrt, blind=FALSE)

  # Pairwise PCA Plot
  pdf(sub("$", "subsetted_pca_plot.pdf", Dir), 5, 5)
  plotPCA(rld, intgroup=labels[[1]])
  dev.off()
  # Pairwise PCA Plot with more than one PCA parameter
  if (length(labels)>1) {
    pcaData <- plotPCA(rld, intgroup=c(labels[[1]], labels[[2]]), returnData=TRUE)
    pdf(sub("$", "subsetted_twoDimensional_pca_plot.pdf", Dir), 5, 5)
    percentVar <- round(100 * attr(pcaData, "percentVar"))
    ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]], shape=labels[[2]])) +
      geom_point(size=3) +
      xlab(paste0("PC1: ",percentVar[1],"% variance")) +
      ylab(paste0("PC2: ",percentVar[2],"% variance")) +
      coord_fixed()
    dev.off()
  }

  # Heatmap
  topGenes <- head(order(res.lrt$padj), 50)
  # Extract topGenes from rld object
  plot <- assay(rld)[topGenes,] #for 2+ types

  df <- as.data.frame(colData(rld))
  if (length(subset_cols)==1) {
    annot <- as.data.frame(cbind(rownames(df), paste(df[[subset_cols[1]]])))
    names(annot) <- c("SampleID", subset_cols[1])
    rownames(annot) <- annot[[sampleID]]
    annot[[sampleID]] <- NULL
  } else {
    annot <- df[,subset_cols]
  }

  pdf(sub("$", "subsetted_heatmap.pdf", Dir), 5, 5)
  pheatmap(assay(rld)[topGenes,], cluster_rows=T, scale="row", fontsize=6,fontsize_row=6,fontsize_col=6,show_rownames=T, cluster_cols=T, annotation_col=annot, labels_col=as.character(rownames(df)), main = paste("Heatmap of top 50 DE genes across selected samples"))
  dev.off()
}

R ggplot2 dplyr DESeq2 RColorBrewer PCAtools pheatmap genefilter vsn From line 1 of scripts/deseq2_group.R

library("dplyr")
library("DESeq2")

counts = snakemake@input[['counts']]

metadata <- snakemake@params[['samples']]

sampleID <- snakemake@params[['sample_id']]

Type <- snakemake@params[['linear_model']]

contrast <- snakemake@params[['contrast']]

baseline <- contrast[[2]]

target <- contrast[[1]]

output = snakemake@output[['rds']]

rld_out = snakemake@output[['rld_out']]

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

# Read in metadata table and order according to sampleID
md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
md <- md[order(md[sampleID]),]

# Read in counts table
subdata <- read.table(counts, header=TRUE, row.names=1, sep="\t", check.names=FALSE)
subdata <- subdata[,order(colnames(subdata))]

# Extract only the Types that we want in further analysis & only the PP_ID and Status informative columns
md <- filter(md, !!as.name(Type) == baseline | !!as.name(Type) == target, !!as.name(sampleID) %in% colnames(subdata))

# Keep only the PP_IDs of the types we have chosen in the metadata table above
rownames(md) <- md[[sampleID]]
md[[sampleID]] <- NULL
keep <- colnames(subdata)[colnames(subdata) %in% rownames(md)]
subdata <- subdata[, keep]
dim(subdata)

# Check
stopifnot(rownames(md)==colnames(subdata))

# Obtain the number of genes that meet padj<0.01 for reference line in histogram
dds <- DESeqDataSetFromMatrix(countData=subdata,
                              colData=md,
                              design= as.formula(paste('~',Type)))

dds <- estimateSizeFactors(dds)

# Remove uninformative columns
dds <- dds[ rowSums(counts(dds)) > 1, ]

# Normalization and pre-processing
dds <- DESeq(dds, parallel=parallel)

saveRDS(dds, file=output)

# obtain normalized counts
rld <- rlog(dds, blind=FALSE)
saveRDS(rld, file=rld_out)

R dplyr DESeq2 BiocParallel From line 1 of scripts/deseq2-init.R

library("DESeq2")
library("pheatmap")
library("ggplot2")
library("ggrepel")

print('Setting parameters')

rds = snakemake@input[['rds']]
cat(sprintf(c('RDS object: ',rds,'\n')))

rld = snakemake@input[['rld']]
cat(sprintf(c('RLD object: ',rld,'\n')))

Type = snakemake@params[['linear_model']]
cat(sprintf(c('Type: ',Type,'\n')))

sampleID = snakemake@params[['sample_id']]
cat(sprintf(c('Sample ID: ',sampleID,'\n')))

ma_plot = snakemake@output[['ma_plot']]
cat(sprintf(c('MA plot', ma_plot,'\n')))

out_table = snakemake@output[['table']]
cat(sprintf(c('Summary results table', out_table,'\n')))

out_gene_table = snakemake@output[['geneID_table']]

panel_ma = snakemake@output[['panel_ma']]
cat(sprintf(c('MA panel', panel_ma,'\n')))

heatmap_plot = snakemake@output[['heatmap_plot']]
cat(sprintf(c('Heatmap plot', heatmap_plot,'\n')))

var_heat = snakemake@output[['var_heat']]
cat(sprintf(c('Variance Heatmap plot', var_heat,'\n')))

pca_plot = snakemake@output[['pca_plot']]
cat(sprintf(c('PCA plot', pca_plot,'\n')))

labels <- snakemake@params[['pca_labels']]
cat(sprintf(c('PCA Labels: ',labels)))

cat(sprintf('Load dds DESeqTransform object'))
dds <- readRDS(rds)

cat(sprintf('Load rlog DESeqTransform object'))
rld <- readRDS(rld)

Dir <- "results/diffexp/pairwise/"

plot_cols <- snakemake@config[['meta_columns_to_plot']]
subset_cols = names(plot_cols)

contrast <- c(Type, snakemake@params[["contrast"]])
baseline <- contrast[[3]]
target <- contrast[[2]]

upCol = "#FF9999"
downCol = "#99CCFF"
ncCol = "#CCCCCC"
adjp <- 0.01
FC <- 2

ens2geneID <- snakemake@config[['ens2geneID']]

parallel <- FALSE
if (snakemake@threads > 1) {
    library("BiocParallel")
    # setup parallelization
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

# Pairwise PCA Plot
pdf(pca_plot)
plotPCA(rld, intgroup=labels[[1]])
dev.off()

# Pairwise PCA Plot with more than one PCA parameter
if (length(labels)>1) {
  pca_plot2 <- sub("$",paste(contrast[2],"vs",contrast[3],"twoDimensional_pca_plot.pdf", sep = "-"), Dir)
  pcaData <- plotPCA(rld, intgroup=c(labels[[1]], labels[[2]]), returnData=TRUE)
  pdf(pca_plot2, 5, 5)
  percentVar <- round(100 * attr(pcaData, "percentVar"))
  ggplot(pcaData, aes_string("PC1", "PC2", color=labels[[1]], shape=labels[[2]])) +
    geom_point(size=3) +
    xlab(paste0("PC1: ",percentVar[1],"% variance")) +
    ylab(paste0("PC2: ",percentVar[2],"% variance")) +
    coord_fixed()
  dev.off()
}

res <- results(dds, contrast=contrast,  independentFiltering = FALSE, cooksCutoff = Inf)
# shrink fold changes for lowly expressed genes
res <- lfcShrink(dds, contrast=contrast, res=res)

# MA plot - calc norm values yourself to plot with ggplot
# MA plot is log2normalized counts (averaged across all samples) vs. log2FC

# extract normalized counts to calculate values for MA plot
norm_counts <- counts(dds, normalized=TRUE)

## select up regulated genes
forPlot <- as.data.frame(res)
forPlot$log2Norm <- log2(rowMeans(norm_counts))
forPlot$Gene <- rownames(forPlot)

## Replace ensemble id's with gene id's
gene_id = read.delim(ens2geneID)

## Remove unique identifier .xx from heatmap data
forPlot$Gene <- sub("\\.[0-9]*", "", forPlot$Gene)
iv <- match(forPlot$Gene, gene_id$ensembl_gene_id)
head(gene_id[iv,])

## assign the rownames of plot_LG to their external gene name using a variable, where we have indexed the 
## row number for all matches between these two dataframes
## Use paste to get rid of factors of this column, and just paste the value of the gene name
forPlot$Gene  <- paste(gene_id[iv, "external_gene_name"])

up <- forPlot$padj < adjp & forPlot$log2FoldChange > log2(FC)
sum(up)

## select down regulated genes
down <- forPlot$padj < adjp & forPlot$log2FoldChange < -log2(FC)
sum(down)

# Grab the top 5 up and down regulated genes to label in the volcano plot
if (sum(up)>5) {
  temp <- forPlot[up,]
  upGenesToLabel <- head(rownames(temp[order(-temp$log2FoldChange),], 5))
} else if (sum(up) %in% 1:5) {
  temp <- forPlot[up,]
  upGenesToLabel <- rownames(temp[order(-temp$log2FoldChange),])
}

if (sum(down)>5) {
  temp <- forPlot[down,]
  downGenesToLabel <- head(rownames(temp[order(temp$log2FoldChange),], 5))
} else if (sum(down) %in% 1:5) {
  temp <- forPlot[down,]
  downGenesToLabel <- rownames(temp[order(temp$log2FoldChange),])
}

forPlot$Expression <- ifelse(down, 'down',
                  ifelse(up, 'up','NS'))
forPlot$Expression <- factor(forPlot$Expression, levels=c("up","down","NS"))

# Assign colours to conditions
if (sum(up)==0 & sum(down)==0) {
  colours <- ncCol
} else if (sum(up)==0) {
  colours <- c(downCol, ncCol)
} else if (sum(down)==0) {
  colours <- c(upCol, ncCol)
} else {
  colours <- c(upCol, downCol, ncCol)
}

# Create vector for labelling the genes based on whether genes are DE or not
if (exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- c(downGenesToLabel, upGenesToLabel)
} else if (exists("downGenesToLabel") & !exists("upGenesToLabel")) {
  genesToLabel <- downGenesToLabel
} else if (!exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- upGenesToLabel
}

if (exists("genesToLabel")) {
  maPlot <- ggplot(forPlot, mapping=aes(x=log2Norm, y=log2FoldChange, colour=Expression)) +
    geom_point() +
    geom_hline(yintercept=c(-1,1), linetype="dashed", color="black") +
    geom_label_repel(aes(label=ifelse(Gene %in% genesToLabel, as.character(Gene),'')),box.padding=0.1, point.padding=0.5, segment.color="gray70", show.legend=FALSE) +
    scale_colour_manual(values=colours) +
    ggtitle(paste(baseline, "vs", target)) +
    xlab("log2(Normalized counts)") +
    ylab("log2(Fold Change)") +
    theme(plot.title = element_text(hjust=0.5))
} else {
  maPlot <- ggplot(forPlot, mapping=aes(x=log2Norm, y=log2FoldChange, colour=Expression)) +
    geom_point() +
    geom_hline(yintercept=c(-1,1), linetype="dashed", color="black") +
    scale_colour_manual(values=colours) +
    ggtitle(paste(baseline, "vs", target)) +
    xlab("log2(Normalized counts)") +
    ylab("log2(Fold Change)") +
    theme(plot.title = element_text(hjust=0.5))
}

# MA plot
pdf(ma_plot)
print({
  maPlot
})
dev.off()

# P-histogram
p_hist = snakemake@output[['p_hist']]
pdf(p_hist)
hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20, col = "grey50", border = "white", main='P values for genes with mean normalized count larger than 1',xlab='pvalue')
dev.off()

#panel ma plot
pdf(panel_ma)
par(mfrow=c(2,2),mar=c(2,2,1,1) +0.1)
ylim <- c(-2.5,2.5)
resGA <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="greaterAbs")
resLA <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="lessAbs")
resG <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="greater")
resL <- results(dds, contrast=contrast, lfcThreshold=.5, altHypothesis="less")
drawLines <- function() abline(h=c(-.5,.5),col="dodgerblue",lwd=2)
plotMA(resGA, ylim=ylim); drawLines()
plotMA(resLA, ylim=ylim); drawLines()
plotMA(resG, ylim=ylim); drawLines()
plotMA(resL, ylim=ylim); drawLines()
mtext(resG@elementMetadata$description[[2]], outer=T, cex=.6,line=-1)
dev.off()

# Heatmap of top 50 genes
topGenes <- head(order(res$padj),50)

df <- as.data.frame(colData(rld))
if (length(subset_cols)==1) {
  annot <- as.data.frame(cbind(rownames(df), paste(df[[subset_cols[1]]])))
  names(annot) <- c("SampleID", subset_cols[1])
  rownames(annot) <- annot$SampleID
  annot$SampleID <- NULL
} else {
  annot <- df[,subset_cols]
}

forHeat <- assay(rld)[topGenes,]

## Remove unique identifier .xx from heatmap data
rownames(forHeat) <- sub("\\.[0-9]*", "", rownames(forHeat))
iv <- match(rownames(forHeat), gene_id$ensembl_gene_id)
head(gene_id[iv,])

## assign the rownames of plot_LG to their external gene name using a variable, where we have indexed the 
## row number for all matches between these two dataframes
## Use paste to get rid of factors of this column, and just paste the value of the gene name
rownames(forHeat) <- paste(gene_id[iv, "external_gene_name"])

pdf(heatmap_plot)
pheatmap(forHeat, cluster_rows=T, scale="row", fontsize=6,fontsize_row=6,fontsize_col=6,show_rownames=T, cluster_cols=T, annotation_col=annot, labels_col=as.character(rownames(df)), main = paste("Heatmap of top 50 DE genes:", contrast[2], "vs", contrast[3]))
dev.off()

# Variance Heatmap
pdf(var_heat)
topVarGenes <- head(order(rowVars(assay(rld)), decreasing = TRUE), 50)
mat  <- assay(rld)[ topVarGenes, ]
mat  <- mat - rowMeans(mat)

## Remove unique identifier .xx from heatmap data
rownames(mat) <- sub("\\.[0-9]*", "", rownames(mat))
iv <- match(rownames(mat), gene_id$ensembl_gene_id)
head(gene_id[iv,])
rownames(mat) <- paste(gene_id[iv, "external_gene_name"])

pheatmap(mat, scale="row", annotation_col = annot,fontsize=6, main = paste("Heatmap of top 50 most variable genes:", contrast[2], "vs", contrast[3]))
dev.off()

# sort by p-value
res <- res[order(res$padj),]
write.table(as.data.frame(res), file=out_table, quote=FALSE, sep='\t')

# Export table with the geneIDs
## Remove unique identifier .xx from results table
res$GeneID <- sub("\\.[0-9]*", "", rownames(res))
iv <- match(res$GeneID, gene_id$ensembl_gene_id)
head(gene_id[iv,])

## Paste external gene name of these ensembl IDs into this column instead
## Use paste to get rid of factors of this column, and just paste the value of the gene name
res$GeneID  <- paste(gene_id[iv, "external_gene_name"])

resGen <- cbind(res$GeneID, as.data.frame(res)[,1:6])
colnames(resGen)[1] <- "GeneID"

# Export to table
write.table(resGen, file=out_gene_table, row.names=FALSE, quote=FALSE, sep="\t")

R ggplot2 DESeq2 ggrepel BiocParallel PCAtools pheatmap From line 1 of scripts/deseq2_pairwise.R

import pandas as pd

files = snakemake.input
out = snakemake.output[0]

def split_exon_fraction(files, out):
    sample_list = []
    for file in files:
        sample = file.split('.')[0]
        name = sample.split('/')[3]
        results = {}
        equal = False
        for line in open(file):
            if '==' in line:
                equal = True

            if not equal:
                line = line.split()
                key ='_'.join(line[:2])
                results[key] = float(line[-1])

            if equal:
                if not '==' in line and not'Group' in line:
                    line = line.split()
                    key = line[0]
                    results[key] = float(line[2])
                else:
                    continue
        Total = results['CDS_Exons'] + results["5'UTR_Exons"] + results["3'UTR_Exons"] + results['Introns'] + results['TSS_up_1kb'] + results['TSS_up_5kb'] + results['TSS_up_10kb'] + results['TES_down_1kb'] + results['TES_down_5kb'] + results['TES_down_10kb']
        exon_fraction = (results['CDS_Exons'] + results["5'UTR_Exons"] + results["3'UTR_Exons"]) / Total
        intron_fraction = results['Introns'] / Total
        intergenic_fraction = (results['TSS_up_1kb'] + results['TSS_up_5kb'] + results['TSS_up_10kb'] + results['TES_down_1kb'] + results['TES_down_5kb'] + results['TES_down_10kb']) / Total
        sample_list.append([name, exon_fraction, intron_fraction, intergenic_fraction])

    frame = pd.DataFrame(sample_list)
    frame.columns = ['Sample','Exon','Intron','Intergenic']
    frame.to_csv(out,sep='\t',index=False)

split_exon_fraction(files, out)

Python Pandas From line 1 of scripts/get_rd.py

library(DESeq2)
library(dplyr)
library(ggplot2)

# Generate subdata 
counts <- snakemake@input[['counts']]

metadata <- snakemake@params[['samples']]

sampleID <- snakemake@params[['sample_id']]

hist <- snakemake@output[['histogram']]

numGenes <- snakemake@output[['numGenes']]

permList <- snakemake@output[['permList']]

Type <- snakemake@params[['linear_model']]

contrast <- snakemake@params[['contrast']]

baseline <- contrast[[2]]

target <- contrast[[1]]

md <- read.delim(file=metadata, sep = "\t", stringsAsFactors = FALSE)
md <- md[order(md[[sampleID]]),]

# Read in counts table
subdata <- read.table(counts, header=TRUE, row.names=1, sep="\t")
subdata <- subdata[,order(colnames(subdata))]

# Extract only the Types that we want in further analysis & only the PP_ID and Status informative columns
md <- filter(md, !!as.name(Type) == baseline | !!as.name(Type) == target, !!as.name(sampleID) %in% colnames(subdata))

# Keep only the PP_IDs of the types we have chosen in the metadata table above
rownames(md) <- md[[sampleID]]
md[[sampleID]] <- NULL
keep <- colnames(subdata)[colnames(subdata) %in% rownames(md)]
subdata <- subdata[, keep]
dim(subdata)

# Check
stopifnot(rownames(md)==colnames(subdata))

# Get the number of Cancer samples and number of HD samples from md table
num1 = sum(md[[Type]] == baseline)
num2 = sum(md[[Type]] == target)

# Create a vector for both HD and Can, with a 1 for every HD and a 2 for every Cancer sample
One_vector = rep(c(1), times = num1)
Two_vector = rep(c(2), times = num2)

# Permutation
# Concatenate the HD and Can vector to create your "start group" vector
start_group = c(One_vector, Two_vector)
cutoff=0.01
number_of_diff_genes=c()
group_list = list()
number_of_try = 500

for (i in 1:number_of_try)
{
  print(i)
  group = data.frame(type=factor(sample(start_group)))

  dds = DESeqDataSetFromMatrix(countData = subdata,
                               colData = group,
                               design = ~ type)

  # Extract normalized counts
  dds = estimateSizeFactors(dds)

  # Remove genes with zero counts over all samples
  dds <- dds[ rowSums(counts(dds)) > 1, ]

  # Make sure of reference, set it by rlevel
  dds$type = relevel(dds$type, ref = 1)

  # The standard differential expression analysis steps are wrapped into a single function, DESeq
  dds = DESeq(dds)

  # Extract results
  res = results(dds, contrast = c("type", "1", "2"), independentFiltering = FALSE,cooksCutoff = Inf)

  tmp=sum(res$padj < cutoff, na.rm=TRUE)
  number_of_diff_genes = c(number_of_diff_genes,tmp)
  group_list[[i]] <- group

}

# Obtain the number of genes that meet padj<0.01 for reference line in histogram
dds <- DESeqDataSetFromMatrix(countData=subdata,
                              colData=md,
                              design= as.formula(paste('~',Type)))

dds <- estimateSizeFactors(dds)

# Remove uninformative columns
dds <- dds[ rowSums(counts(dds)) > 1, ]

# Normalization and pre-processing
dds <- DESeq(dds)

# Extract results and the number of significant genes with padj<0.01
results = results(dds, contrast = c(Type, target, baseline), independentFiltering = FALSE,cooksCutoff = Inf)
numSig <- sum(results$padj < cutoff, na.rm=TRUE)
number_of_diff_genes <- as.data.frame(number_of_diff_genes)
names(number_of_diff_genes) <- "NumDiffGenes"
number_of_diff_genes$Actual <- numSig

p <- ggplot(number_of_diff_genes, aes(x=NumDiffGenes)) +
  geom_histogram(bins=100) +
  geom_vline(data=number_of_diff_genes, mapping=aes(xintercept = numSig, color = "Correct Labels"), 
             linetype="longdash", size=0.6, show.legend = T) +
  scale_color_manual(values = "gray75", name = "Number of DE genes") +
  ggtitle(paste(number_of_try, "Random Permutations:", baseline, "vs", target)) +
  xlab("Number of significant genes") +
  theme(aspect.ratio=1,
        plot.title = element_text(hjust = 0.5),
        legend.title = element_text(size=10, hjust = 0.5))

pdf(hist)
print({
  p
})
dev.off()

df <- data.frame(stringsAsFactors = FALSE)

for (i in 1:number_of_try) {
  if (i==1) {
    df = group_list[[i]]
  }
  else {
    df = cbind(df, group_list[[i]])
  }
  colnames(df)[i] = paste("perm",i, sep = "_")
}

write.csv(number_of_diff_genes, numGenes)
write.csv(df, permList)

R ggplot2 dplyr From line 1 of scripts/permutation_test.R

library("DESeq2")
library("reshape2")
library("cowplot")
library("limma")
library("vsn")
library("genefilter")
library("ggplot2")
library("dplyr")
library("RColorBrewer")
library("pheatmap")
library("hexbin")

# output files
MDS_out <- snakemake@output[['mds_plot']]
MDS_table <- snakemake@output[['mds_table']]
heatmap_out <- snakemake@output[['heatmap_plot']]
sd_out <- snakemake@output[['sd_plot']]
normCounts_out <- snakemake@output[['rlogCounts_plot']]
normCounts_fac <- snakemake@output[['rlogCounts_fac_plot']]
rawCounts_out <- snakemake@output[['counts_plot']]
rawCounts_fac <- snakemake@output[['counts_fac_plot']]

# parameters
sampleID <- snakemake@params[['sample_id']]
Type = snakemake@params[['linear_model']]
plot_cols <- snakemake@config[['meta_columns_to_plot']]
subset_cols = names(plot_cols)

# color palette
colors <- snakemake@params[['colors']]
discrete <- snakemake@params[['discrete']]

# DESeq2 objects
rld <- snakemake@input[['rld']]
dds <- snakemake@input[['rds']]

rld <- readRDS(rld)
dds <- readRDS(dds)

# function to grab the ggplot2 colours
gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

rawCounts <- counts(dds, normalized=FALSE)
md <- as.data.frame(colData(rld))
md$SampleID <- rownames(md)

if(colors[[1]] !='NA' & discrete[[1]] =='NA'){
    if (brewer.pal.info[colors[[1]],]$maxcolors >= length(unique(md[[Type]]))) {
        pal <- brewer.pal(length(unique(md[[Type]])),name=colors[[1]])
    } 
} else if(discrete[[1]] != 'NA' & length(discrete)==length(unique(md[[Type]]))){
        pal <- unlist(discrete)
} else {
        pal <- gg_color_hue(length(unique(md[[Type]])))
}


df1 <- melt(rawCounts) %>%
  dplyr::rename(Gene=Var1) %>%
  dplyr::rename(SampleID=Var2) %>%
  dplyr::rename(counts=value)

iv <- match(df1$SampleID, md$SampleID)
df1$Condition <- paste(md[iv,][[Type]])
df1$SampleID <- factor(df1$SampleID, levels=unique(md$SampleID))

# aesthetic for plots
dodge <- position_dodge(width = 0.6)
theme_update(plot.title = element_text(hjust = 0.5))

p1 <- ggplot(data=df1, mapping=aes(x=SampleID, y=counts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_y_log10() +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=6))

# width of pdf to ensure all sampleIDs are visible when exported to pdf
# This was generated with a use case of 16 samples and a width of 7 fitting well, the +8 is to account for the margins
if (nrow(md)<8) {
  width <- 6
} else {
  width <- 7/24*(nrow(md)+8)
}

# raw counts boxplot
pdf(rawCounts_out, width, 5)
print({
  p1
})
dev.off()

# faceted by condition
p2 <- ggplot(data=df1, mapping=aes(x=SampleID, y=counts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_y_log10() +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=4)) +
  facet_wrap(~Condition)

pdf(rawCounts_fac, 2*width, 5)
print({
  plot_grid(p1, p2)
})
dev.off()

# Run same analysis for log2-transformed normalized counts
df2 <- melt(assay(rld)) %>%
  dplyr::rename(Gene=Var1) %>%
  dplyr::rename(SampleID=Var2) %>%
  dplyr::rename(normCounts=value)

# Add Condition information to this dataframe
iv <- match(df2$SampleID, md$SampleID)
df2$Condition <- paste(md[iv,][[Type]])
df2$SampleID <- factor(df2$SampleID, levels=unique(md$SampleID))

p1 <- ggplot(data=df2, mapping=aes(x=SampleID, y=normCounts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=6)) +
  ylab("regularized log expression")

# raw counts boxplot
pdf(normCounts_out, width, 5)
print({
  p1
})
dev.off()

# faceted by condition
p2 <- ggplot(data=df2, mapping=aes(x=SampleID, y=normCounts, fill=Condition)) +
  geom_violin(width=0.7) +
  geom_boxplot(width=0.2, outlier.colour=NA, position = dodge, color="gray28") +
  scale_fill_manual(values=pal) +
  theme(axis.text.x = element_text(hjust=1, angle=45, size=4)) +
  facet_wrap(~Condition) +
  ylab("regularized log expression")

pdf(normCounts_fac, 2*width, 5)
print({
  plot_grid(p1, p2)
})
dev.off()

# Standard deviation vs. mean
ntd <- normTransform(dds)

pdf(sd_out)
meanSdPlot(assay(ntd))
dev.off()

# Generate annotation column for heatmap
if (length(subset_cols)==1) {
  annot <- as.data.frame(cbind(rownames(md), paste(md[[subset_cols[1]]])))
  names(annot) <- c("SampleID", subset_cols[1])
  rownames(annot) <- annot$SampleID
  annot$SampleID <- NULL
} else {
  annot <- md[,subset_cols]
}

# Remove rows with no standard deviation so that pheatmap executes properly
filt <- assay(rld)[apply(assay(rld), MARGIN = 1, FUN = function(x) sd(x) != 0),]

hm <- pheatmap(filt, show_rownames=F, clustering_distance_rows = "correlation", 
         clustering_distance_cols = "correlation", clustering_method = "average", 
         annotation_col = annot, scale = "row", 
         main="Unsupervised heatmap of all gene counts across samples",
         fontsize_row=4, fontsize_col=6, fontsize=8,
         color = colorRampPalette(c("navy", "white", "firebrick3"))(50))

save_pheatmap_pdf <- function(x, filename) {
  stopifnot(!missing(x))
  stopifnot(!missing(filename))
  pdf(filename)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}

save_pheatmap_pdf(hm, heatmap_out)

# use plotMA function from limma, then extract data from this variable to plot with ggplot2
p <- plotMDS(assay(rld), top = 1000)
df <- data.frame(x=p$x, y=p$y, name=names(p$x))
iv <- match(df$name, md$SampleID)
df$Condition <- paste(md[iv,][[Type]])

pdf(MDS_out)
ggplot(data=df, mapping=aes(x=x,y=y)) +
  geom_point(size=3, colour = "black", show.legend = TRUE) +
  geom_point(aes(color=Condition), size=2.2) +
  scale_colour_manual(values=pal) +
  xlab("Leading logFC dim 1") +
  ylab("Leading logFC dim 2") +
  ggtitle("MDS Plot")
dev.off()

# Export the table for MDS
write.table(df, file=MDS_table, sep="\t", quote=F, row.names=FALSE)

R ggplot2 dplyr DESeq2 cowplot RColorBrewer reshape2 pheatmap limma genefilter vsn hexbin From line 1 of scripts/QC.R

library("data.table")
library("qvalue")

stats_table <- snakemake@input[['stats_table']]
cat(sprintf(c('stats table: ', stats_table, '\n')))


qplot <- snakemake@output[['qplot']]
cat(sprintf(c('Qvalue Output: ', qplot, '\n')))

qhist <- snakemake@output[['qhist']]
cat(sprintf(c('Qvalue hist Output: ', qhist, '\n')))

out_table = snakemake@output[['table']]
cat(sprintf(c('Summary results table', out_table,'\n')))

stats_frame = read.table(stats_table, row.names=1, sep='\t', check.names=F)

qobj = qvalue(p=stats_frame$pvalue, fdr.level=T)

stats_frame$qvalues = qobj$qvalues
stats_frame$lfdr = qobj$lfdr
write.table(as.data.frame(stats_frame), file=out_table, quote=FALSE, sep='\t')

pdf(qplot)
plot(qobj)
dev.off()

pdf(qhist)
hist(qobj)
dev.off()

R data.table qvalue From line 1 of scripts/qplot.R

library(ggplot2)
library(ggrepel)

degFile = snakemake@input[['degFile']]

FC <- snakemake@params[['FC']]

adjp <- snakemake@params[['adjp']]

contrast <- snakemake@params[['contrast']]

baseline <- contrast[[2]]

target <- contrast[[1]]

volcano_plot=snakemake@output[['volcano_plot']]

upCol = "#FF9999"
downCol = "#99CCFF"
ncCol = "#CCCCCC"

ens2geneID <- snakemake@config[['ens2geneID']]

##----------load differentially expressed genes --------#
print("Loading differential expressed gene table")
print(degFile)

## check if an rda file or tab sep
deg <- read.delim(file=degFile)

head(deg)
dim(deg)

## set all NA missing p-values to 1 (NA is DESeq2 default)
deg[is.na(deg$padj), "padj"] <- 1

## select up regulated genes
up <- deg$padj < adjp & deg$log2FoldChange > log2(FC)
sum(up)

## select down regulated genes
down <- deg$padj < adjp & deg$log2FoldChange < -log2(FC)
sum(down)

## Replace ensemble id's with gene id's
gene_id = read.delim(ens2geneID)

## Remove unique identifier .xx from heatmap data
rownames(deg) <- sub("\\.[0-9]*", "", rownames(deg))
iv <- match(rownames(deg), gene_id$ensembl_gene_id)
head(gene_id[iv,])

## assign the rownames of plot_LG to their external gene name using a variable, where we have indexed the 
## row number for all matches between these two dataframes
## Use paste to get rid of factors of this column, and just paste the value of the gene name
deg$Gene <- paste(gene_id[iv, "external_gene_name"])

# Grab the top 5 up and down regulated genes to label in the volcano plot
if (sum(up)>5) {
  upGenesToLabel <- head(deg[up,]$Gene, 5)
} else if (sum(up) %in% 1:5) {
  upGenesToLabel <- deg[up,]$Gene
}

if (sum(down)>5) {
  downGenesToLabel <- head(deg[down,]$Gene, 5)
} else if (sum(down) %in% 1:5) {
  downGenesToLabel <- deg[down,]$Gene
}

## calculate the -log10(adjp) for the plot
deg$log10padj <- -log10(deg$padj)

# assign up and downregulated genes to a category so that they can be labeled in the plot
deg$Expression <- ifelse(down, 'down',
                  ifelse(up, 'up','NS'))
deg$Expression <- factor(deg$Expression, levels=c("up","down","NS"))

# Assign colours to conditions
if (sum(up)==0 & sum(down)==0) {
  colours <- ncCol
} else if (sum(up)==0) {
  colours <- c(downCol, ncCol)
} else if (sum(down)==0) {
  colours <- c(upCol, ncCol)
} else {
  colours <- c(upCol, downCol, ncCol)
}

# Set all Infinity values to max out at 500 so that all points are contained in the plot
if ("Inf" %in% deg$log10padj) {
  deg$log10padj[deg$log10padj=="Inf"] <- max(deg[is.finite(deg$log10padj),"log10padj"]) + 2
}

# Assign genes to label based on whether genes are DE or not
if (exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- c(downGenesToLabel, upGenesToLabel)
} else if (exists("downGenesToLabel") & !exists("upGenesToLabel")) {
  genesToLabel <- downGenesToLabel
} else if (!exists("downGenesToLabel") & exists("upGenesToLabel")) {
  genesToLabel <- upGenesToLabel
}

if (exists("genesToLabel")) {
  p <- ggplot(data=deg, mapping=aes(x=log2FoldChange, y=log10padj, colour=Expression)) +
    geom_vline(xintercept = c(-log2(FC),log2(FC)), linetype="dashed", colour="gray45") +
    geom_hline(yintercept = -log10(adjp), linetype="dashed", colour="gray45") +
    geom_label_repel(aes(label=ifelse(Gene %in% genesToLabel, as.character(Gene),'')),box.padding=0.1, point.padding=0.5, segment.color="gray70", show.legend=FALSE) +
    geom_point() +
    ylab("-log10(FDR)") +
    xlab("log2(Fold Change)") +
    ggtitle(paste(target, "vs", baseline)) +
    scale_colour_manual(values=colours) +
    theme(plot.title = element_text(hjust = 0.5, face="plain"),
          axis.title.x = element_text(size=11),
          axis.title.y = element_text(size=11),
          panel.background = element_blank(),
          axis.line = element_line(colour = "gray45"),
          legend.key = element_rect(fill = "gray96"),
          legend.text = element_text(size = 10))
} else {
  p <- ggplot(data=deg, mapping=aes(x=log2FoldChange, y=log10padj, colour=Expression)) +
    geom_vline(xintercept = c(-log2(FC),log2(FC)), linetype="dashed", colour="gray45") +
    geom_hline(yintercept = -log10(adjp), linetype="dashed", colour="gray45") +
    geom_point() +
    geom_vline(xintercept = c(-log2(FC),log2(FC)), linetype="dashed", colour="gray45") +
    geom_hline(yintercept = -log10(adjp), linetype="dashed", colour="gray45") +
    ylab("-log10(FDR)") +
    xlab("log2(Fold Change)") +
    ggtitle(paste(target, "vs", baseline)) +
    scale_colour_manual(values=colours) +
    theme(plot.title = element_text(hjust = 0.5, face="plain"),
          axis.title.x = element_text(size=11),
          axis.title.y = element_text(size=11),
          panel.background = element_blank(),
          axis.line = element_line(colour = "gray45"),
          legend.key = element_rect(fill = "gray96"),
          legend.text = element_text(size = 10))
}


pdf(volcano_plot)
print({
  p
})
dev.off()

R ggplot2 DESeq2 ggrepel From line 1 of scripts/RNAseq_makeVolcano.R

library(Glimma)
library(limma)
library(DESeq2)

project_id = snakemake@params[['project_id']]

rds = snakemake@input[['rds']]
cat(sprintf(c('RDS object: ',rds,'\n')))


out_path = file.path(getwd(),'results','diffexp')
dir.create(out_path)

rds = readRDS(rds)
groups.df = as.data.frame(colData(rds))
glMDSPlot(rds, top = 1000, groups=groups.df,path=out_path,html=paste(project_id,'mds_plot',sep='.'),launch=FALSE)

R Glimma From line 1 of scripts/run_glimma_mds.R

library(Glimma)
library(limma)
library(DESeq2)

condition = snakemake@params[['condition']]
cat(sprintf(c('Condition: ',condition,'\n')))
Type <- snakemake@config[['linear_model']]

title = snakemake@params[["contrast"]]

contrast = c(condition, snakemake@params[["contrast"]])
rds = snakemake@input[['rds']]
cat(sprintf(c('RDS object: ',rds,'\n')))

ens2geneID <- snakemake@config[['ens2geneID']]

out_path = file.path(getwd(),'results','diffexp')
dir.create(out_path)
print(out_path)
rds = readRDS(rds)
groups.df = as.data.frame(colData(rds))


#### by contrasts
contrasts_to_plot = resultsNames(rds)
res <- results(rds, contrast=contrast)
res$padj[is.na(res$padj)] = 1

rnaseq = as.data.frame(counts(rds, normalized=T))

## Replace ensemble id's with gene id's
gene_id = read.delim(ens2geneID)

## Remove unique identifier .xx from heatmap data
rownames(res) <- sub("\\.[0-9]*", "", rownames(res))
iv <- match(rownames(res), gene_id$ensembl_gene_id)
head(gene_id[iv,])

res$GeneID  <- paste(gene_id[iv, "external_gene_name"])

res <- res[order(res$padj),]
# Remove duplicated geneIDs (this will, be default, remove the ones with the higher p-values, as we have ordered by p-value above
res <- res[!duplicated(res$GeneID),]

# Also subset your rnaseq counts table by these ensembl IDs so all of the data matches
rownames(rnaseq) <- sub("\\.[0-9]*", "", rownames(rnaseq))
rnaseq <- rnaseq[rownames(rnaseq) %in% rownames(res),]
# Now match gene symbols to these ensembl IDS
iv <- match(rownames(rnaseq), gene_id$ensembl_gene_id)
head(gene_id[iv,])
# Add gene symbols as rownames
rownames(rnaseq)  <- paste(gene_id[iv, "external_gene_name"])

# Now remove old ensembl IDs from results and paste gene symbol there
rownames(res) <- res$GeneID
res$GeneID <- NULL

genes = as.data.frame(row.names(res))
colnames(genes) = 'GeneID'

status_frame = res[,c('log2FoldChange','padj')]
status_frame['status'] = 0
status_frame$padj[is.na(status_frame$padj)] = 1
status_frame[status_frame$padj<0.05 & status_frame$log2FoldChange < 0 ,'status'] = -1
status_frame[status_frame$padj<0.05 & status_frame$log2FoldChange > 0 ,'status'] = 1

title = paste(title[1],'vs',title[2],sep='-')

glMDPlot(res, anno=genes, status=status_frame$status, samples=colnames(rnaseq), 
         counts=log2(rnaseq + 0.0001), groups=groups.df[[Type]], main=strsplit(res@elementMetadata$description[2],': ')[[1]][2], 
         transform=F, side.ylab='Log2-expression',launch=FALSE,side.main='GeneID', html = paste(title,'ma_plot',sep='.'),path=out_path)

## Volcano plot
glXYPlot(x=res$log2FoldChange, y=-log10(res$pvalue), xlab="logFC", ylab="logodds",path=out_path,
         status=status_frame$status, launch=FALSE,counts=log2(rnaseq + 0.0001), groups=groups.df[[Type]], anno=genes,main=strsplit(res@elementMetadata$description[2],': ')[[1]][2],html = paste(title,'volcano_plot',sep='.'))

R Glimma From line 1 of scripts/run_glimma.R

degFile = snakemake@input[['degFile']]

assembly <- snakemake@params[['assembly']]

FC <- snakemake@params[['FC']]

adjp <- snakemake@params[['adjp']]

printTree <- snakemake@params[['printTree']]

library(GO.db)
library(topGO)
library(ggplot2)
library(RColorBrewer)
library(biomaRt)
library(GenomicFeatures)
library(Rgraphviz)

##----------load differentially expressed genes --------#
print("Loading differential expressed gene table")
print(degFile)

deg <- read.delim(file=degFile,header=TRUE,sep="\t")
rownames(deg) <- sub("\\.[0-9]*","",rownames(deg))

##---------load correct Biomart------------------------#
if (assembly == "hg19") {
    organismStr <- "hsapiens"
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg19.Ens_75.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.89") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 89!
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg38.Ens_89.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "hg38.90") {
    organismStr <- "hsapiens"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("/home/groups/CEDAR/anno/biomaRt/hg38.Ens_90.biomaRt.GO.geneID2GO.RData"))
    xx <- get(load("/home/groups/CEDAR/anno/biomaRt/GO.db.Term.list.rda"))
}
if (assembly == "mm10") {
    organismStr <- "mmusculus"
    ### to get to hg38 mappings ensembl 90!
    geneID2GO <- get(load("./anno/biomaRt/hg38.Ens_90.biomaRt.GO.external.geneID2GO.RData"))
    xx <- get(load("./anno/biomaRt/GO.db.Term.list.rda"))
}


##-----------------------------------Functions--------------------------------------#
runGO <- function(geneList,xx=xx,otype,setName){
    setLength       <- sum(as.numeric(levels(geneList))[geneList]) 
    fname           <- paste(Dir, paste(setName, otype, "GO.txt", sep="_"), sep="/")
    GOData          <- new("topGOdata", ontology=otype, allGenes=geneList, annot = annFUN.gene2GO, gene2GO = geneID2GO)
    resultFisher    <- runTest(GOData, algorithm = "classic", statistic = "fisher")## statistical test for topGO
    x               <- GenTable(GOData, classicFisher=resultFisher, topNodes=length(names(resultFisher@score)))## make go table for all terms
    x               <- data.frame(x)
    pVal            <- data.frame(pval=signif(resultFisher@score, 6)) ## get unrounded pvalue
    x$enrich        <- x$Significant/x$Expected ## calculate enrichment based on what you expect by chance
    x$p.unround     <- pVal[x$GO.ID,"pval"]## put unrounded pvalue in the table
    x$p.adj         <- signif(p.adjust(x$p.unround, method="BH"), 6)## calculate the adjusted pvalue with Benjamini & Hochberg correction
    x$log.p.adj     <- -log10(x$p.adj) ## convert adjusted p value to -log10 for plot magnitude
    #x$Term.full     <- sapply(x$GO.ID, FUN=function(n){Term(xx[[n]])}) ## get the full term name
    x <- x[order(x$GO.ID),]
    write.table(x, file=fname, sep="\t", col.names=TRUE, quote=FALSE, row.names=FALSE) ## save the table
    ## you can print the tree if you want, but since I keep the list of all of them skip
    if(printTree>0){
        printGraph(GOData,## make the tree for the go data
                   resultFisher,
                   firstSigNodes = 5,
                   fn.prefix = sub("_GO.txt$", "", fname),
                   useInfo = "all",
                   pdfSW = TRUE
                   )
    }
    return(x)  
}

## function to make barplot of -log10 adjusted pvalues colored by enrichment

drawBarplot <- function(go, ontology, setName){
    go <- go[!go$p.adj > 0.01,]
    if(nrow(go)>1){
        #go$Term.full <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term.full,1,50)), "\\,"), `[`, 1)))
        go$Term <- make.unique(paste(sapply(strsplit(as.character(substring(go$Term,1,50)), "\\,"), `[`, 1)))
        print(setName)
        go <- go[with(go, order(p.adj, -enrich)),]
        ## Currently there is a discrepency between xx and x, so we only use Term right now, not Term.full
        #go$Term.full <-factor(paste(go$Term.full), levels=rev(paste(go$Term.full))) ## sort table by adjusted p-value
        go$Term <-factor(paste(go$Term), levels=rev(paste(go$Term))) ## sort table by adjusted p-value
        ptitle <- paste(ontology, setName) ## plot title
        ptitle <- gsub("^.*/","",ptitle)
        pfname <- paste(setName,ontology,"pdf",sep=".")## name of png file
        if(nrow(go) < 20 ){
            toprange <- 1:nrow(go)
        }else{
            toprange <- 1:20
        }
        top <- go[toprange,]
        col <- colorRampPalette(c("white","navy"))(16)   
        pdf(file=paste(Dir, pfname, sep="/"),height=5,width=7)
        print({
           p <- ggplot(top, aes(y=log.p.adj, x=Term, fill=enrich)) + ## ggplot barplot function
               geom_bar(stat="identity",colour="black") +
               ggtitle(ptitle) +
               xlab("") + ylab("-log10(fdr)") +
               scale_fill_gradient(low=col[2], high=col[15], name="enrichment", limits=c(0,ceiling(max(top$enrich))))+
               coord_flip()+
               theme(panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_blank(), 
                     panel.background = element_blank(), axis.line = element_line(colour = "black"))+
               theme(text = element_text(size=8),
                     axis.text.x = element_text(vjust=1,color="black",size=8),
                     axis.text.y = element_text(color="black",size=8),
                     plot.title=element_text(size=10))   
       })
       dev.off()
    }
}

print("get up genes and make geneList")
up <- deg$padj < adjp & deg$log2FoldChange >= log2(FC)
up <- unique(rownames(deg[up,]))
all <-unique(names(geneID2GO))
up.geneList <-  factor(as.integer(all %in% up))
names(up.geneList) <- all

up.setsize <- sum(as.numeric(levels(up.geneList))[up.geneList])
print("setsize for significant genes") 
up.setsize

adjplabel <- gsub("^0\\.","",adjp)
comparison <- gsub("\\.tsv$|\\.txt$|\\.rda$|\\.RData$","",degFile)

Dir <- sub("$", "/GOterms", dirname(comparison))
if(!(file.exists(Dir))) {
      dir.create(Dir,FALSE,TRUE)
}

if (up.setsize >=2) {

print("make GO table for the up genes")
#################################
go.UP.BP <- runGO(geneList=up.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))
#go.UP.MF <- runGO(geneList=up.geneList,xx=xx,otype="MF",setName=paste(comparison,"up",sep="."))

print("make the png for the up genes")
drawBarplot(go=go.UP.BP,ontology="BP",setName=paste(basename(comparison),"upFC",FC, "adjp", adjp, sep="."))

} else {
up_out = snakemake@output[[1]]
write.table('No Significant Genes', file=up_out)
}

print("get down genes and make geneList")
dn <- deg$padj < adjp & deg$log2FoldChange <= -log2(FC)
dn <- unique(rownames(deg[dn,]))
all <-unique(names(geneID2GO))
dn.geneList <-  factor(as.integer(all %in% dn))
names(dn.geneList) <- all

dn.setsize <- sum(as.numeric(levels(dn.geneList))[dn.geneList])
print("setsize for significant genes") 
dn.setsize

if(dn.setsize >= 2){

print("make GO table for down genes")
go.DN.BP <- runGO(geneList=dn.geneList,xx=xx,otype="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

print("make barplot for down genes")
drawBarplot(go=go.DN.BP,ontology="BP",setName=paste(basename(comparison),"downFC",FC, "adjp", adjp, sep="."))

}else{
down_out = snakemake@output[[2]]
write.table('No Significant Genes', file=down_out)
}

R ggplot2 RColorBrewer GO.db biomaRt Rgraphviz topGO From line 1 of scripts/runGOforDESeq2.R

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
region = snakemake.params.get("region", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


shell("samtools stats {extra} {snakemake.input}"
      " {region} > {snakemake.output} {log}")