Workflow Steps and Code Snippets

18 tagged steps and code snippets that match the keyword org.Hs.eg.db

Easy Copy Number Analysis (EaCoN) Pipeline

 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
renv::activate()
library(EaCoN)
library(data.table)
library(qs)
library(GenomicRanges)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(org.Hs.eg.db)
library(BiocParallel)
library(RaggedExperiment)

# -- 0.2 Parse snakemake parameters
# Bind the snakemake S4 slots to short names; `input`, `params` and `output`
# are list-like and indexed positionally or by name below.
input <- snakemake@input
params <- snakemake@params
output <- snakemake@output

# -- 0.3 Load utility functions
# Provides buildGRangesFromASCNAndL2R, binReferenceGenome and
# annotateGRangesWithTxDB used later in this script.
source(file.path("scripts", "utils.R"))

# -- 1. Load the optimal gamma for each sample
# Expected columns (from usage below): sample_name, gamma.
best_fits <- fread(input[[1]])

# -- 2. Locate the .RDS files associated with each sample's best fit.
# List a sample directory recursively and keep only paths matching `pattern`.
.find_rds <- function(sample_dir, pattern) {
    all_paths <- list.files(sample_dir, recursive=TRUE, full.names=TRUE)
    grep(pattern=pattern, all_paths, value=TRUE)
}

sample_dirs <- file.path(params$out_dir, best_fits$sample_name)

# ASCN results live under the gamma-specific subdirectory of the optimal fit
best_fit_files <- Map(
    .find_rds,
    sample_dir=sample_dirs,
    pattern=paste0(".*gamma", sprintf("%.2f", best_fits$gamma), "/.*RDS$")
)
# log2-ratio results live under the L2R subdirectory
l2r_files <- Map(
    .find_rds,
    sample_dir=sample_dirs,
    pattern=".*L2R/.*RDS$"
)

# -- 3. Load the best fit ASCN and L2R data and build GRanges objects

# Deserialize one ASCN and one L2R .RDS pair and combine them into a single
# GRanges via the project helper (sourced from scripts/utils.R above).
.build_granges_from_cnv <- function(ascn, l2r) {
    ascn_data <- readRDS(ascn)
    l2r_data <- readRDS(l2r)
    buildGRangesFromASCNAndL2R(ascn_data, l2r_data)
}

# Configure the default BiocParallel backend with the requested worker count,
# then build one GRanges per sample in parallel. SIMPLIFY=FALSE keeps a list;
# USE.NAMES=TRUE propagates the file-path names from best_fit_files.
BPPARAM <- BiocParallel::bpparam()
BiocParallel::bpworkers(BPPARAM) <- params$nthreads
gr_list <- BiocParallel::bpmapply(.build_granges_from_cnv,
    best_fit_files, l2r_files,
    SIMPLIFY=FALSE, USE.NAMES=TRUE, BPPARAM=BPPARAM
)

# removing directory paths so the element names are bare sample identifiers
names(gr_list) <- basename(names(gr_list))

# -- 4. Construct a RaggedExperiment object (one column per sample)
ragged_exp <- as(GRangesList(gr_list), "RaggedExperiment")

# include annotated bins to summarize the RaggedExperiment with
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
genome_bins <- binReferenceGenome()
annotated_bins <- annotateGRangesWithTxDB(genome_bins, txdb=txdb)

# include annotated genes to summarize the RaggedExperiment with
gene_granges <- genes(txdb)
annotated_genes <- annotateGRangesWithTxDB(gene_granges, txdb=txdb)

# Stash the annotation ranges plus a reduction rule in the object metadata so
# downstream consumers can summarize consistently: numeric scores collapse to
# their mean; anything else collapses to a "value:count,..." summary string.
metadata(ragged_exp) <- list(
    annotated_genome_bins=annotated_bins,
    annotated_genes=annotated_genes,
    simplifyReduce=function(scores, ranges, qranges) {
        if (is.numeric(scores)) {
            x <- mean(scores, na.rm=TRUE)
        } else {
            count_list <- as.list(table(scores))
            x <- paste0(
                paste0(names(count_list), ":", unlist(count_list)),
                collapse=","
            )
        }
        return(x)
    }
)

# -- Save files to disk
# BUG FIX: was `params$nthread`; every other use in this script reads
# `params$nthreads` (see the bpworkers call above), and a missing list
# element would silently yield NULL here.
qsave(ragged_exp, file=output[[1]], nthreads=params$nthreads)

cluster_rnaseq: RNA-seq pipeline.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
# Redirect stdout and the message stream to the Snakemake-provided log file so
# the entire run is captured in one place.
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

suppressMessages(library("DESeq2"))
suppressMessages(library("openxlsx"))
suppressMessages(library("org.Hs.eg.db"))
suppressMessages(library("AnnotationDbi"))

## PARALLELIZATION ##
# Register a multicore backend only when more than one thread was allocated;
# `parallel` is forwarded to results() further below.
parallel <- FALSE
if (snakemake@threads > 1) {
    suppressMessages(library("BiocParallel"))
    register(MulticoreParam(snakemake@threads))
    parallel <- TRUE
}

## SNAKEMAKE I/O ##
dds <- snakemake@input[["dds"]]

## SNAKEMAKE PARAMS ##
# `condition`: name of the design variable; `levels`: c(test, reference)
# levels of that variable to contrast.
condition <- snakemake@params[["condition"]]
levels <- snakemake@params[["levels"]]

## CODE ##
# Get dds (`dds` holds the input path first, then the deserialized object)
dds <- readRDS(dds)

# Get results: Wald test of levels[1] vs levels[2] at alpha = 0.05
res <- results(dds, contrast = c(condition, levels), alpha=0.05, parallel = parallel)

# Annotate the GeneSymbol and complete GeneName from the ENSEMBL Gene ID.
# The trailing version suffix (e.g. ".16") is stripped before mapping because
# the mapIds() calls below key on unversioned ENSEMBL IDs.
ensg_res <- rownames(res)
ensemblGene_DEA <- gsub("\\.[0-9]*$", "", ensg_res)

ensg_symbol <- as.data.frame(mapIds(org.Hs.eg.db, keys = ensemblGene_DEA,
                                    column = "SYMBOL", keytype = "ENSEMBL"))
colnames(ensg_symbol) <- "GeneSymbol"
ensg_genename <- as.data.frame(mapIds(org.Hs.eg.db, keys = ensemblGene_DEA,
                                      column = "GENENAME", keytype = "ENSEMBL"))
colnames(ensg_genename) <- "GeneName"
# Join symbol and full gene name on the unversioned ID (rownames)
annot <- merge(ensg_symbol, ensg_genename, by = "row.names", all = TRUE)
rownames(res) <- ensemblGene_DEA
res <- merge(as.data.frame(res), annot, by.x = "row.names", by.y = 1, all = TRUE)
# Restore the original (versioned) IDs as rownames via a lookup from
# unversioned -> versioned built before the merges.
ENSG_res_list <- as.list(ensg_res)
names(ENSG_res_list) <- ensemblGene_DEA
rownames(res) <- ENSG_res_list[res$Row.names]
col_order <- c("GeneSymbol", "GeneName", "baseMean", "log2FoldChange",
               "lfcSE", "stat", "pvalue", "padj")
res <- res[, col_order]

# Sort by adjusted p-value (order() places NA padj values last)
res <- res[order(res$padj, decreasing = FALSE), ]

# Save tsv (col.names = NA writes an empty header cell above the rownames)
write.table(res, file = snakemake@output[["tsv"]], sep = "\t", quote = FALSE, 
            col.names = NA)

# Add Name column: expose rownames as an explicit first column for Excel
res$EnsemblGeneID <- rownames(res)
res <- res[c("EnsemblGeneID", colnames(res)[1:(ncol(res)-1)])]

# Green and red styles for formatting excel (bgFill variants are used by
# conditionalFormatting; the "plain" variants carry only the font colour)
redStyle <- createStyle(fontColour = "#FF1F00", bgFill = "#F6F600")
redPlainStyle <- createStyle(fontColour = "#FF1F00")
greenStyle <- createStyle(fontColour = "#008000", bgFill = "#F6F600")
greenPlainStyle <- createStyle(fontColour = "#008000")
boldStyle <- createStyle(textDecoration = c("BOLD"))

# Excel file
wb <- createWorkbook()
sheet <- "Sheet1"
addWorksheet(wb, sheet)

# Legend: three rows (merged across columns 1:3) explaining the colour
# coding and the FDR cutoff
legend <- t(data.frame(paste("Upregulated in", levels[1]),
                       paste("Downregulated in", levels[1]), "FDR=0.05"))
writeData(wb, sheet, legend[, 1])
addStyle(wb, sheet, cols = 1, rows = 1, 
         style = createStyle(fontColour = "#FF1F00", fgFill = "#F6F600"))
addStyle(wb, sheet, cols = 1, rows = 2, 
         style = createStyle(fontColour = "#008000", fgFill = "#F6F600"))
addStyle(wb, sheet, cols = 1, rows = 3, style = boldStyle)
invisible(sapply(1:3, function(i) mergeCells(wb, sheet, cols = 1:3, rows = i)))

# Write the results table below the legend and bold the header row.
writeData(wb, sheet, res, startRow = 6)
addStyle(wb, sheet, cols = 1:ncol(res), rows = 6, style = boldStyle, 
         gridExpand = TRUE)
# Conditional formatting over the data rows (sheet rows 7..nrow(res)+6).
# Column E = log2FoldChange, column I = padj for this 9-column layout.
# Highlighted red/green: significant (padj < 0.05) up/down-regulated genes;
# plain red/green: non-significant or padj-NA genes.
# BUG FIX: the "plain" rules previously tested `$I7>0.05`, which left rows
# with padj exactly equal to 0.05 unstyled; `>=` makes the rules exhaustive.
conditionalFormatting(wb, sheet, cols = 1:ncol(res), rows = 7:(nrow(res)+6),
                      rule = "AND($E7>0, $I7<0.05, NOT(ISBLANK($I7)))", style = redStyle)
conditionalFormatting(wb, sheet, cols = 1:ncol(res), rows = 7:(nrow(res)+6),
                      rule = "AND($E7>0, OR($I7>=0.05, ISBLANK($I7)))", style = redPlainStyle)
conditionalFormatting(wb, sheet, cols = 1:ncol(res), rows = 7:(nrow(res)+6),
                      rule = "AND($E7<0, $I7<0.05, NOT(ISBLANK($I7)))", style = greenStyle)
conditionalFormatting(wb, sheet, cols = 1:ncol(res), rows = 7:(nrow(res)+6),
                      rule = "AND($E7<0, OR($I7>=0.05, ISBLANK($I7)))", style = greenPlainStyle)
setColWidths(wb, sheet, 1:ncol(res), widths = 13)

# Save excel
saveWorkbook(wb, file = snakemake@output[["xlsx"]], overwrite = TRUE)



# GET SHRUNKEN LOG FOLD CHANGES.
coef <- paste0(c(condition, levels[1], "vs", levels[2]), collapse = "_")
res_shrink <- lfcShrink(dds, coef=coef, type="apeglm")

# Annotate the GeneSymbol and complete GeneName from the ENSEMBL Gene ID.
ensg_res <- rownames(res_shrink)
ensemblGene_DEA <- gsub("\\.[0-9]*$", "", ensg_res)

ensg_symbol <- as.data.frame(mapIds(org.Hs.eg.db, keys = ensemblGene_DEA,
                                    column = "SYMBOL", keytype = "ENSEMBL"))
colnames(ensg_symbol) <- "GeneSymbol"
ensg_genename <- as.data.frame(mapIds(org.Hs.eg.db, keys = ensemblGene_DEA,
                                      column = "GENENAME", keytype = "ENSEMBL"))
colnames(ensg_genename) <- "GeneName"
annot <- merge(ensg_symbol, ensg_genename, by = "row.names", all = TRUE)
rownames(res_shrink) <- ensemblGene_DEA
res_shrink <- merge(as.data.frame(res_shrink), annot, by.x = "row.names", by.y = 1, all = TRUE)
ENSG_res_list <- as.list(ensg_res)
names(ENSG_res_list) <- ensemblGene_DEA
rownames(res_shrink) <- ENSG_res_list[res_shrink$Row.names]
col_order <- c("GeneSymbol", "GeneName", "baseMean", "log2FoldChange",
               "lfcSE", "pvalue", "padj")
res_shrink <- res_shrink[, col_order]

# Sort by adjusted p-value
res_shrink <- res_shrink[order(res_shrink$padj, decreasing = FALSE), ]

# Save tsv
write.table(res_shrink, file = snakemake@output[["tsv_lfcShrink"]], sep = "\t", quote = FALSE, 
            col.names = NA)

# Add Name column
res_shrink$EnsemblGeneID <- rownames(res_shrink)
res_shrink <- res_shrink[c("EnsemblGeneID", colnames(res_shrink)[1:(ncol(res_shrink)-1)])]

# Green and red styles for formatting excel
# (re-created here so this section is self-contained; duplicates the styles
# defined for the unshrunken table)
redStyle <- createStyle(fontColour = "#FF1F00", bgFill = "#F6F600")
redPlainStyle <- createStyle(fontColour = "#FF1F00")
greenStyle <- createStyle(fontColour = "#008000", bgFill = "#F6F600")
greenPlainStyle <- createStyle(fontColour = "#008000")
boldStyle <- createStyle(textDecoration = c("BOLD"))

# Excel file (a fresh workbook for the shrunken results)
wb <- createWorkbook()
sheet <- "Sheet1"
addWorksheet(wb, sheet)

# Legend: three rows (merged across columns 1:3) explaining the colour
# coding and the FDR cutoff
legend <- t(data.frame(paste("Upregulated in", levels[1]),
                       paste("Downregulated in", levels[1]), "FDR=0.05"))
writeData(wb, sheet, legend[, 1])
addStyle(wb, sheet, cols = 1, rows = 1, 
         style = createStyle(fontColour = "#FF1F00", fgFill = "#F6F600"))
addStyle(wb, sheet, cols = 1, rows = 2, 
         style = createStyle(fontColour = "#008000", fgFill = "#F6F600"))
addStyle(wb, sheet, cols = 1, rows = 3, style = boldStyle)
invisible(sapply(1:3, function(i) mergeCells(wb, sheet, cols = 1:3, rows = i)))

# Write the shrunken results table below the legend and bold the header row.
writeData(wb, sheet, res_shrink, startRow = 6)
addStyle(wb, sheet, cols = 1:ncol(res_shrink), rows = 6, style = boldStyle, 
         gridExpand = TRUE)
# Conditional formatting over the data rows. For this 8-column layout
# column E = log2FoldChange and column H = padj (no "stat" column).
# BUG FIX: the "plain" rules previously tested `$H7>0.05`, which left rows
# with padj exactly equal to 0.05 unstyled; `>=` makes the rules exhaustive.
conditionalFormatting(wb, sheet, cols = 1:ncol(res_shrink), rows = 7:(nrow(res_shrink)+6),
                      rule = "AND($E7>0, $H7<0.05, NOT(ISBLANK($H7)))", style = redStyle)
conditionalFormatting(wb, sheet, cols = 1:ncol(res_shrink), rows = 7:(nrow(res_shrink)+6),
                      rule = "AND($E7>0, OR($H7>=0.05, ISBLANK($H7)))", style = redPlainStyle)
conditionalFormatting(wb, sheet, cols = 1:ncol(res_shrink), rows = 7:(nrow(res_shrink)+6),
                      rule = "AND($E7<0, $H7<0.05, NOT(ISBLANK($H7)))", style = greenStyle)
conditionalFormatting(wb, sheet, cols = 1:ncol(res_shrink), rows = 7:(nrow(res_shrink)+6),
                      rule = "AND($E7<0, OR($H7>=0.05, ISBLANK($H7)))", style = greenPlainStyle)
setColWidths(wb, sheet, 1:ncol(res_shrink), widths = 13)

# Save excel
saveWorkbook(wb, file = snakemake@output[["xlsx_lfcShrink"]], overwrite = TRUE)

A pipeline to connect GWAS Variant-to-Gene-to-Program (V2G2P) Approach

  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# Resolve function-name clashes up front: prefer the listed package's
# implementation whenever multiple attached packages export the same name.
# FIX: removed duplicated conflict_prefer("select"/"melt"/"slice") calls
# that appeared twice in the original list (harmless but redundant).
library(conflicted)
conflict_prefer("combine", "dplyr")
conflict_prefer("select", "dplyr")   # multiple packages have select(), prioritize dplyr
conflict_prefer("melt", "reshape2")
conflict_prefer("slice", "dplyr")
conflict_prefer("Position", "ggplot2")
conflict_prefer("first", "dplyr")
conflict_prefer("summarize", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("list", "base")
conflict_prefer("desc", "dplyr")
conflict_prefer("rename", "dplyr")

suppressPackageStartupMessages({
    library(optparse)
    library(dplyr)
    library(tidyr)
    library(reshape2)
    library(ggplot2)
    library(ggpubr) ## ggarrange
    library(gplots) ## heatmap.2
    library(ggrepel)
    library(readxl)
    library(xlsx) ## might not need this package
    library(writexl)
    library(org.Hs.eg.db)
})


# Command-line interface. Defaults mirror the original 2kG.library analysis.
# FIX: added the --barcodeDir option; the script reads opt$barcodeDir further
# below (per-cell metadata table) but no such option was declared, so the
# value was always NULL and could not be supplied on the command line.
option.list <- list(
    make_option("--sampleName", type="character", default="2kG.library", help="Name of the sample"),
    make_option("--outdir", type="character", default="/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/211206_ctrl_only_snakemake/analysis/all_genes/2kG.library.ctrl.only/K25/threshold_0_2/", help="Output directory"),
    make_option("--scratch.outdir", type="character", default="", help="Scratch space for temporary files"),
    make_option("--K.val", type="numeric", default=60, help="K value to analyze"),
    make_option("--density.thr", type="character", default="0.2", help="concensus cluster threshold, 2 for no filtering"),
    make_option("--barcodeDir", type="character", default="", help="Path to the per-cell barcode metadata table"),
    ## make_option("--cell.count.thr", type="numeric", default=2, help="filter threshold for number of cells per guide (greater than the input number)"),
    ## make_option("--guide.count.thr", type="numeric", default=1, help="filter threshold for number of guide per perturbation (greater than the input number)"),
    make_option("--perturbSeq", type="logical", default=TRUE, help="Whether this is a Perturb-seq experiment")
)
opt <- parse_args(OptionParser(option_list=option.list))



## ## K562 gwps 2k overdispersed genes
## ## opt$figdir <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/230104_snakemake_WeissmanLabData/figures/top2000VariableGenes/WeissmanK562gwps/K90/"
## opt$outdir <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/230104_snakemake_WeissmanLabData/analysis/top2000VariableGenes/WeissmanK562gwps/K90/threshold_0_2/"
## opt$K.val <- 90
## opt$sampleName <- "WeissmanK562gwps"
## opt$perturbSeq <- TRUE
## opt$scratch.outdir <- "/scratch/groups/engreitz/Users/kangh/Perturb-seq_CAD/230104_snakemake_WeissmanLabData/top2000VariableGenes/K90/analysis/comprehensive_program_summary/"
## opt$barcodeDir <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/230104_snakemake_WeissmanLabData/data/K562_gwps_raw_singlecell_01_metadata.txt"

## OUTDIR <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/220316_regulator_topic_definition_table/outputs/"
## FIGDIR <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/220316_regulator_topic_definition_table/figures/"
## SCRATCHOUTDIR <- "/scratch/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/220316_regulator_topic_definition_table/outputs/"
OUTDIR <- opt$outdir
# BUG FIX: was `opt$scratch.outidr` (typo). `$` on a list returns NULL for a
# non-matching name, so SCRATCHOUTDIR was silently NULL: the scratch
# directory was never created and scratch file paths collapsed to relative
# paths when concatenated with paste0().
SCRATCHOUTDIR <- opt$scratch.outdir
check.dir <- c(OUTDIR, SCRATCHOUTDIR)
# Create any missing output directories (recursively)
invisible(lapply(check.dir, function(x) { if(!dir.exists(x)) dir.create(x, recursive=TRUE) }))

# Shared ggplot theme and diverging colour palette for downstream figures
mytheme <- theme_classic() + theme(axis.text = element_text(size = 12), axis.title = element_text(size = 16), plot.title = element_text(hjust = 0.5, face = "bold"))
palette <- colorRampPalette(c("#38b4f7", "white", "red"))(n = 100)



## parameters
## OUTDIRSAMPLE <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/210707_snakemake_maxParallel/analysis/2kG.library/all_genes/2kG.library/K60/threshold_0_2/"
OUTDIRSAMPLE <- opt$outdir
k <- opt$K.val 
SAMPLE <- opt$sampleName
# density threshold with "." replaced by "_" for use in file names
DENSITY.THRESHOLD <- gsub("\\.","_", opt$density.thr)
## SUBSCRIPT=paste0("k_", k,".dt_",DENSITY.THRESHOLD,".minGuidePerPtb_",opt$guide.count.thr,".minCellPerGuide_", opt$cell.count.thr)
# short file-name suffix shared by all outputs of this run
SUBSCRIPT.SHORT=paste0("k_", k, ".dt_", DENSITY.THRESHOLD)

## ## load ref table (for Perturbation distance to GWAS loci annotation in Perturb_plus column)
## ref.table <- read.delim(file="/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/data/ref.table.txt", header=T, check.names=F, stringsAsFactors=F)

## ## load test results
## all.test.combined.df <- read.delim(paste0("/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/220312_compare_statistical_test/outputs/all.test.combined.df.txt"), stringsAsFactors=F)
## all.test.MAST.df <- all.test.combined.df %>% subset(test.type == "batch.correction")

## MAST.df.4n.input <- read.delim(paste0("/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/220217_MAST/2kG.library4n3.99x_MAST.txt"), stringsAsFactors=F, check.names=F)
## MAST.df.4n <- MAST.df.4n.input %>% ## remove multiTarget entries
##     subset(!grepl("multiTarget", perturbation)) %>%
##     group_by(zlm.model.name) %>%
##     mutate(fdr.across.ptb = p.adjust(`Pr(>Chisq)`, method="fdr")) %>%
##     ungroup() %>%
##     subset(zlm.model.name == "batch.correction") %>%
##     select(-zlm.model.name) %>%
##     as.data.frame
## colnames(MAST.df.4n) <- c("Topic", "p.value", "log2FC", "log2FC.ci.hi", "log2fc.ci.lo", "fdr", "Perturbation", "fdr.across.ptb")

## Load Regulator Data
# Only Perturb-seq runs have MAST differential-topic results. Columns are
# renamed to this script's conventions (rename(new = old)), and a ProgramID
# ("K<k>_<topic>") is derived from primerid when topic labels are present.
if(opt$perturbSeq) {
    MAST.file.name <- paste0(OUTDIR, "/", SAMPLE, "_MAST_DEtopics.txt")
    message(paste0("loading ", MAST.file.name))
    MAST.df <- read.delim(MAST.file.name, stringsAsFactors=F, check.names=F) %>% rename("Perturbation" = "perturbation") %>%
        rename("log2FC" = "coef", "log2FC.ci.hi" = ci.hi, "log2FC.ci.lo" = ci.lo, "p.value" = "Pr(>Chisq)")
    if(grepl("topic", MAST.df$primerid) %>% sum > 0) MAST.df <- MAST.df %>% mutate(ProgramID = paste0("K", k, "_", gsub("topic_", "", primerid))) %>% as.data.frame    
}



## ## 2n MAST
## file.name <- paste0("/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/211116_snakemake_dup4_cells/analysis/all_genes//Perturb_2kG_dup4/acrossK/aggregated.outputs.findK.perturb-seq.RData")
## load(file.name)
## MAST.df.2n <- MAST.df %>%
##     filter(K == 60) %>%
##     select(-K) %>%
##     select(-zlm.model.name)
## colnames(MAST.df.2n) <- c("Topic", "p.value", "log2FC", "log2FC.ci.hi", "log2fc.ci.lo", "fdr", "Perturbation", "fdr.across.ptb")



## load gene annotations (Refseq + Uniprot)
gene.summaries.path <- "/oak/stanford/groups/engreitz/Users/kangh/TeloHAEC_Perturb-seq_2kG/data/combined.gene.summaries.txt"
gene.summary <- read.delim(gene.summaries.path, stringsAsFactors=F)


## load topic model results (the .RData is expected to provide omega,
## theta.zscore and related cNMF objects used below)
cNMF.result.file <- paste0(OUTDIRSAMPLE,"/cNMF_results.",SUBSCRIPT.SHORT, ".RData")
print(cNMF.result.file)
if(file.exists(cNMF.result.file)) {
    print("loading cNMF result file")
    load(cNMF.result.file)
}


## If every rowname of theta.zscore is an Ensembl gene ID, convert rownames
## to gene symbols; IDs that fail to map keep their original Ensembl name.
db <- ifelse(grepl("mouse", SAMPLE), "org.Mm.eg.db", "org.Hs.eg.db")
gene.ary <- theta.zscore %>% rownames
if(grepl("^ENSG", gene.ary) %>% as.numeric %>% sum == nrow(theta.zscore)) {
    GeneSymbol.ary <- mapIds(get(db), keys=gene.ary, keytype = "ENSEMBL", column = "SYMBOL")
    GeneSymbol.ary[is.na(GeneSymbol.ary)] <- row.names(theta.zscore)[is.na(GeneSymbol.ary)]
    rownames(theta.zscore) <- GeneSymbol.ary
}

## per-cell barcode metadata (must contain a CBC column for the merge below)
meta_data <- read.delim(opt$barcodeDir, stringsAsFactors=F) 

## annotate the cell x program weight matrix (omega) with per-cell metadata.
## BUG FIX: the merge previously ended with `all.T` -- an undefined symbol
## that R lazily matched to the unused `by` argument and never evaluated --
## so the intended full outer join was silently an inner join. Request it
## explicitly with `all = TRUE`.
ann.omega <- merge(meta_data, omega, by.x="CBC", by.y=0, all = TRUE)


##########################################################################################
## create table
# Build the "topic defining genes" table for program t: each gene's z-scored
# weight in the program, annotated with RefSeq/UniProt summaries and ranked
# by descending z-score. Relies on globals `gene.summary` and `k`.
#
# FIX: dropped the pointless `out <-` assignment -- it caused the function
# to return its value invisibly; the pipeline is now returned directly.
# Also `all.x=T` -> `all.x=TRUE`.
create_topic_definition_table <- function(theta.zscore, t) {
    theta.zscore[,t] %>%
        as.data.frame %>%
        `colnames<-`(c("zscore")) %>%
        mutate(Perturbation = rownames(theta.zscore), .before="zscore") %>%
        merge(gene.summary, by.x="Perturbation", by.y="Gene", all.x=TRUE) %>%
        arrange(desc(zscore)) %>%
        mutate(Rank = 1:n(), .before="Perturbation") %>%
        mutate(ProgramID = paste0("K", k, "_", t), .before="zscore") %>%
        arrange(Rank) %>%
        mutate(My_summary = "", .after = "zscore") %>%
        select(Rank, ProgramID, Perturbation, zscore, My_summary, FullName, Summary)
}

# Build the regulator table for one program: perturbations significant at
# fdr.across.ptb < fdr.thr, annotated with gene summaries and ranked by FDR
# then by descending effect size. Relies on global `gene.summary`.
#
# Args:
#   all.test     data.frame of MAST test results (needs ProgramID,
#                Perturbation, fdr.across.ptb, log2FC, CI and p-value columns)
#   program.here program identifier, e.g. "K60_12"
#   fdr.thr      FDR cutoff for inclusion (default 0.1)
#
# BUG FIX: the body previously ignored its `all.test` argument and read the
# global `MAST.df` instead. At the existing call site MAST.df is what gets
# passed, so behavior there is unchanged, but the function now honors its
# parameter. Also dropped the pointless `out <-` assignment (it made the
# return value invisible).
create_topic_regulator_table <- function(all.test, program.here, fdr.thr = 0.1) {
    all.test %>%
        subset(ProgramID == program.here &
               fdr.across.ptb < fdr.thr) %>%
        select(Perturbation, fdr.across.ptb, log2FC, log2FC.ci.hi, log2FC.ci.lo, fdr, p.value) %>%
        merge(gene.summary, by.x="Perturbation", by.y="Gene", all.x=TRUE) %>%
        arrange(fdr.across.ptb, desc(log2FC)) %>%
        mutate(Rank = 1:n(), .before="Perturbation") %>%
        mutate(My_summary = "", .after="Perturbation") %>%
        mutate(ProgramID = program.here, .after="Rank") %>%
        select(Rank, ProgramID, Perturbation, fdr.across.ptb, log2FC, My_summary,
               FullName, Summary, log2FC.ci.hi, log2FC.ci.lo, fdr, p.value) %>%
        arrange(Rank)
}



# For each of the k programs, assemble a CSV containing manual-annotation
# header sections, the top 100 program-defining genes, and (for Perturb-seq
# runs) the significant regulators; then bundle every per-program table into
# a single xlsx workbook. Relies on globals: k, opt, SCRATCHOUTDIR, OUTDIR,
# SAMPLE, DENSITY.THRESHOLD.
create_summary_table <- function(ann.omega, theta.zscore, all.test, meta_data) {
    df.list <- vector("list", k)
    for (t in 1:k) {
        program.here <- paste0("K", k, "_", t)

        ## topic defining genes
        ann.theta.zscore <- theta.zscore %>% create_topic_definition_table(t)
        ann.top.theta.zscore <- ann.theta.zscore %>% subset(Rank <= 100) ## select the top 100 topic defining genes to output

        ## regulators, at a deliberately looser FDR cutoff (0.3)
        if(opt$perturbSeq) regulator.MAST.df <- all.test %>% create_topic_regulator_table(program.here, 0.3)

        ## write table to scratch dir; sink() redirects stdout so the cat()
        ## headers and write.csv() bodies below all land in file.name
        file.name <- paste0(SCRATCHOUTDIR, program.here, "_table.csv")
        sink(file=file.name) ## open the document
        ## cat("Author,PERTURBATIONS SUMMARIES\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\n\n\nAuthor,TOPIC SUMMARIES\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\nAuthor,TESTABLE HYPOTHESIS IDEAS:\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\nAuthor,OTHER THOUGHTS:\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\nTOPIC DEFINING GENES (TOP 100),\n") ## headers
        cat("Author,PERTURBATIONS SUMMARIES,,,,,,,,,,,\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\"\nAuthor,TOPIC SUMMARIES\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\"\nAuthor,TESTABLE HYPOTHESIS IDEAS:\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\"\nAuthor,OTHER THOUGHTS:\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\"\n\nTOPIC DEFINING GENES (TOP 100),\n") ## headers
        write.csv(ann.top.theta.zscore, row.names=F) ## topic defining genes
        cat("\n\"\n\"\nPERTURBATIONS REGULATING TOPIC AT FDR < 0.3 (most significant on top),,,,,,,,,,,\n") ## headers

        if(opt$perturbSeq) write.csv(regulator.MAST.df, row.names=F) ## regulators
        sink() ## close the document

        ## read the assembled table back (as comma-separated) to save to list
        df.list[[t]] <- read.delim(file.name, stringsAsFactors=F, check.names=F, sep=",")
    }


    ## output to xlsx: one sheet per program
    names(df.list) <- paste0("Program ", 1:k)
    write_xlsx(df.list, paste0(OUTDIR, "/", SAMPLE, "_k_", k, ".dt_", DENSITY.THRESHOLD, "_ComprehensiveProgramSummary.xlsx"))

    return(df.list)
}


# BUG FIX: was `opt$perturbSeq == "F"`. opt$perturbSeq is a logical, and
# comparing a logical to the string "F" coerces it to "TRUE"/"FALSE", so the
# test was always FALSE and MAST.df was left undefined for non-Perturb-seq
# runs (crashing the call below). Test the logical directly; isTRUE() also
# guards against a missing/NA value.
if(!isTRUE(opt$perturbSeq)) MAST.df <- data.frame()
df <- create_summary_table(ann.omega, theta.zscore, MAST.df, meta_data)

A Snakemake based modular Workflow that facilitates RNA-Seq analyses with a special focus on splicing

  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
library("BiocParallel")

# Load data
library("tximeta")	# Import transcript quantification data from Salmon
library("tximport")	# Import transcript-level quantification data from Kaleidoscope, Sailfish, Salmon, Kallisto, RSEM, featureCounts, and HTSeq
library("rhdf5")
library("SummarizedExperiment")

library("magrittr")	# Pipe operator
library("DESeq2")		# Differential gene expression analysis

# Plotting libraries
library("pheatmap")
library("RColorBrewer")
library("ggplot2")

# Mixed
library("PoiClaClu")
library("glmpca")
library("apeglm")
library("genefilter")
library("AnnotationDbi")
library("org.Hs.eg.db")


# ---------------- Loading read/fragment quantification data from Salmon output ----------------
# Build a DESeqDataSet from Salmon quantifications listed in a tab-separated
# annotation table (columns: sample_name, salmon_results_file, condition,
# additional_comment). Returns the dds with design ~condition and "Control"
# as the reference level.
load_in_salmon_generated_counts <- function(annotation_table_file) {
	# ----------------- 1. Load annotation -----------------
	# Columns: 1. names, 2. files, 3. condition, 4. additional information
	annotation_table <- read.csv(file=annotation_table_file, sep="\t")

	# tximeta expects columns named `names` and `files`
	annotation_data <- data.frame(
									names=annotation_table[,"sample_name"],
									files=file.path(annotation_table[,"salmon_results_file"]),
									condition=annotation_table[,"condition"],
									add_info=annotation_table[,"additional_comment"]
									)
	# Replace None in condition column with "Control"
	annotation_data$condition[annotation_data$condition=="None"] <- "Control"
	annotation_data$condition[annotation_data$condition==""] <- "Control"

	# ----------------- 2. Load into Bioconductor experiment objects -----------------
	# Summarized experiment: Imports quantifications & metadata from all samples -> Each row is a transcript
	se <- tximeta(annotation_data)

	# Summarize transcript-level quantifications to the gene level -> reduces row number: Each row is a gene
	# Includes 3 matrices:
	# 1. counts: Estimated fragment counts per gene & sample
	# 2. abundance: Estimated transcript abundance in TPM
	# 3. length: Effective Length of each gene (including biases as well as transcript usage)
	gse <- summarizeToGene(se)

	# ----------------- 3. Load experiments into DESeq2 object -----------------
	# SummarizedExperiment
	# assayNames(gse)   		# Get all assays -> counts, abundance, length, ...
	# head(assay(gse), 3)     	# Get count results for first 3 genes
	# colSums(assay(gse))     	# Compute sums of mapped fragments
	# rowRanges(gse)          	# Print rowRanges: Ranges of individual genes
	# seqinfo(rowRanges(gse))   # Metadata of sequences (chromosomes in our case)

	gse$condition <- as.factor(gse$condition)
	gse$add_info <- as.factor(gse$add_info)

	# Use relevel to make sure untreated is listed first
	gse$condition %<>% relevel("Control")   # Concise way of saying: gse$condition <- relevel(gse$condition, "Control")

	# Construct DESeqDataSet from gse.
	# NOTE(review): both branches currently fit the same ~condition design;
	# the add_info branch only differs in what it prints (see TODO below).
	if (gse$add_info %>% unique %>% length >1) {
		# Add info column with more than 1 unique value
		print("More than 1 unique value in add_info column")
		# TODO Need to make sure to avoid:
		# the model matrix is not full rank, so the model cannot be fit as specified.
		#   One or more variables or interaction terms in the design formula are linear
		#   combinations of the others and must be removed.
		# dds <- DESeqDataSet(gse, design = ~condition + add_info)
		print("However, simple DESeq2 analysis will be performed without add_info column")
		dds <- DESeqDataSet(gse, design = ~condition)

	} else {
		print("Only 1 unique value in add_info column")
		dds <- DESeqDataSet(gse, design = ~condition)
	}

	return(dds)
}


# ---------------- Loading read/fragment quantification data from RSEM output ----------------
# Build a DESeqDataSet from RSEM gene-level results listed in a tab-separated
# annotation table (columns: sample_name, rsem_results_file, condition,
# additional_comment).
load_in_rsem_generated_counts <- function(annotation_table_file) {
	# ----------------- 1. Load annotation -----------------
	samples <- read.csv(file=annotation_table_file, sep="\t")
	rsem_paths <- file.path(samples[, "rsem_results_file"])
	# For sample.genes.results: txIn= FALSE & txOut= FALSE
	# For sample.isoforms.results: txIn= TRUE & txOut= TRUE
	# Check: https://bioconductor.org/packages/devel/bioc/vignettes/tximport/inst/doc/tximport.html
	quant <- tximport(rsem_paths, type = "rsem", txIn = FALSE, txOut = FALSE)

	# Column metadata for DESeq2, keyed by sample name
	col_data <- data.frame(
		condition = factor(samples[, "condition"]),
		add_info = factor(samples[, "additional_comment"])
	)
	rownames(col_data) <- samples[, "sample_name"]

	# Construct DESeqDataSet from tximport. NOTE: both branches currently use
	# the ~condition design; the ~condition + add_info design remains
	# commented out pending a full-rank model matrix.
	if (length(unique(col_data$add_info)) > 1) {
		# Add info column with more than 1 unique value
		# dds <- DESeqDataSetFromTximport(quant, col_data, ~condition + add_info)
		dds <- DESeqDataSetFromTximport(quant, col_data, ~condition)
	} else {
		dds <- DESeqDataSetFromTximport(quant, col_data, ~condition)
	}
	return(dds)
}


# Build a DESeqDataSet from Kallisto quantifications listed in a tab-separated
# annotation table (columns: sample_name, kallisto_results_file, condition,
# additional_comment). txOut = TRUE keeps transcript-level rows (no gene
# summarization here).
load_in_kallisto_generated_counts <- function(annotation_table_file) {
	# ----------------- 1. Load annotation -----------------
	annotation_table <- read.csv(file=annotation_table_file, sep="\t")

	files <- file.path(annotation_table[,"kallisto_results_file"])
	txi.kallisto <- tximport(files, type = "kallisto", txOut = TRUE)

	annotation_data <- data.frame(condition=factor(annotation_table[,"condition"]),
									add_info=factor(annotation_table[,"additional_comment"])
						)
	rownames(annotation_data) <- annotation_table[,"sample_name"]

	# Construct DESeqDataSet from tximport.
	# FIX: removed a dead if/else on the number of unique add_info values --
	# both branches built the identical ~condition design. The intended
	# multi-factor design is kept for reference until the full-rank model
	# matrix issue (see the Salmon loader) is resolved:
	# dds <- DESeqDataSetFromTximport(txi.kallisto, annotation_data, ~condition + add_info)
	dds <- DESeqDataSetFromTximport(txi.kallisto, annotation_data, ~condition)
	return(dds)
}


# ----------------- Main function -----------------
# Entry point: registers the BiocParallel backend from Snakemake's thread
# allocation, dispatches to the loader matching the configured counting
# algorithm, drops near-empty rows, and saves the DESeqDataSet as RDS.
main_function <- function() {
	threads <- snakemake@threads[[1]]
	register(MulticoreParam(workers=threads))

	# Snakemake variables
	annotation_table_file <- snakemake@input[["annotation_table_file"]]
	output_file <- snakemake@output[["deseq_dataset_r_obj"]]
	count_algorithm <- snakemake@params[["count_algorithm"]]

	# Dispatch on the counting tool that produced the quantifications;
	# unknown values fall through to the stop() default.
	dds <- switch(
		count_algorithm,
		salmon = load_in_salmon_generated_counts(annotation_table_file),
		kallisto = load_in_kallisto_generated_counts(annotation_table_file),
		rsem = load_in_rsem_generated_counts(annotation_table_file),
		stop("Count algorithm not supported!")
	)

	# Remove rows that have no or nearly no information about the amount of gene expression
	print(paste(c("Number of rows before filtering out counts with values <1", nrow(dds))))
	keep <- rowSums(counts(dds)) > 1 # Counts have to be greater than 1
	dds <- dds[keep, ]
	print(paste(c("Number of rows after filtering out counts with values <1", nrow(dds))))

	# Save deseq dataset object
	saveRDS(dds, output_file)
}


# ----------------- Run main function -----------------
# Script entry point: invoked for its side effects when run under Snakemake.
main_function()
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
library("BiocParallel")

# Load data
library("tximeta")	# Import transcript quantification data from Salmon
library("tximport")	# Import transcript-level quantification data from Kaleidoscope, Sailfish, Salmon, Kallisto, RSEM, featureCounts, and HTSeq
library("rhdf5")
library("SummarizedExperiment")

library("magrittr")	# Pipe operator
library("DESeq2")		# Differential gene expression analysis

# Plotting libraries
library("pheatmap")
library("RColorBrewer")
library("ggplot2")

# Mixed
library("PoiClaClu")
library("glmpca")
library("apeglm")
library("genefilter")
library("AnnotationDbi")
library("org.Hs.eg.db")



# ---------------- DESeq2 explorative analysis ----------------
run_deseq2_explorative_analysis <- function(dds, output_files) {
	# Explorative QC of a DESeqDataSet: sample-to-sample distance heatmaps
	# (Euclidean on rlog-transformed counts; Poisson on raw counts) and two
	# PCA plots (DESeq2::plotPCA-based custom plot; glmpca on raw counts).
	#
	# @param dds           DESeqDataSet; colData must provide "condition" and
	#                      "add_info" (both are used for plot grouping).
	# @param output_files  character vector of >= 4 jpeg paths, used
	#                      positionally: [1] Euclidean heatmap,
	#                      [2] Poisson heatmap, [3] custom PCA, [4] glmpca.
	# All output is produced as side effects (jpeg files); no return value.

	# ----------- 4.2 Variance stabilizing transformation and the rlog -------------
	# Problem: PCA depends mostly on points with highest variance
	# -> For gene-counts: Genes with high expression values, and therefore high variance
	# are the ones the PCA is mostly depending on
	# Solution: Apply stabilizing transformation to variance
	# -> transform data, so it becomes more homoskedastic (expected amount of variance the same across different means)
	# 1. Variance stabilizing transformation: VST-function -> fast for large datasets (> 30n)
	# 2. Regularized-logarithm transformation or rlog -> Works well on small datasets (< 30n)

	# The transformed values are no longer counts, and are stored in the assay slot.
	# 1. VST
	# transformed_dds <- vst(dds, blind = FALSE)
	# head(assay(transformed_dds), 3)

	# 2. rlog
	transformed_dds <- rlog(dds, blind=FALSE)

	# ----------- A. Sample distances -------------
	# ----------- A.1 Euclidian distances -------------
	# Sample distances -> Assess overall similarity between samples
	# dist: takes samples as rows and genes as columns -> we need to transpose
	sampleDists <- dist(t(assay(transformed_dds)))

	# Heatmap of sample-to-sample distances using the transformed values
	# Uses euclidian distance between samples
	sampleDistMatrix <- as.matrix(sampleDists)
	# NOTE(review): labels assume colData has a "names" column
	# (transformed_dds$names) — confirm it is set by the upstream loaders.
	rownames(sampleDistMatrix) <- paste(transformed_dds$names, transformed_dds$condition, sep = " - " )
	colnames(sampleDistMatrix) <- paste(transformed_dds$names, transformed_dds$condition, sep = " - " )
	colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
	jpeg(output_files[1], width=800, height=800)
	pheatmap(sampleDistMatrix,
			 clustering_distance_rows = sampleDists,
			 clustering_distance_cols = sampleDists,
			 col = colors,
			 main = "Heatmap of sample-to-sample distances (Euclidian) after normalization")
	dev.off()


	# ----------- A.2 Poisson distances -------------
	# Use Poisson distance
	# -> takes the inherent variance structure of counts into consideration
	# The PoissonDistance function takes the original count matrix (not normalized) with samples as rows instead of
	# columns -> so we need to transpose the counts in dds.
	poisd <- PoissonDistance(t(counts(dds)))

	# heatmap (reuses the "colors" palette defined above)
	samplePoisDistMatrix <- as.matrix(poisd$dd)
	rownames(samplePoisDistMatrix) <- paste(transformed_dds$names, transformed_dds$condition, sep=" - ")
	colnames(samplePoisDistMatrix) <- paste(transformed_dds$names, transformed_dds$condition, sep=" - ")
	jpeg(output_files[2], width=800, height=800)
	pheatmap(samplePoisDistMatrix,
			 clustering_distance_rows = poisd$dd,
			 clustering_distance_cols = poisd$dd,
			 col = colors,
			 main = "Heatmap of sample-to-sample distances (Poisson) without normalization")
	dev.off()

	# ------------ 4.4 PCA plot -------------------

	# ----------------- 4.4.1 Custom PCA plot --------------
	# Build own plot with ggplot -> to distinguish subgroups more clearly
	# Each unique combination of treatment and cell-line has unique color
	# Use function that is provided with DeSeq2
	pcaData <- plotPCA(transformed_dds, intgroup = c("condition", "add_info"), returnData=TRUE)
	percentVar <- round(100 * attr(pcaData, "percentVar"))

	print("Creating custom PCA plot")
	# jpeg device is opened before the ggplot object is printed; the explicit
	# print() below is what actually renders into the device.
	jpeg(output_files[3], width=800, height=800)
	customPCAPlot <- ggplot(pcaData, aes(x=PC1, y=PC2, color=condition, shape=add_info, label=name)) +
		geom_point(size =3) +
		geom_text(check_overlap=TRUE, hjust=0, vjust=1) +
		xlab(paste0("PC1: ", percentVar[1], "% variance")) +
		ylab(paste0("PC2: ", percentVar[2], "% variance")) +
		coord_fixed() +
		ggtitle("PCA on transformed (rlog) data with subgroups (see shapes)")
	print(customPCAPlot)
	dev.off()

	# ----------------- 4.4.2 Generalized PCA plot --------------
	# Generalized PCA: Operates on raw counts, avoiding pitfalls of normalization
	print("Creating generalized PCA plot")
	gpca <- glmpca(counts(dds), L=2)   # L = number of latent dimensions
	gpca.dat <- gpca$factors
	gpca.dat$condition <- dds$condition
	gpca.dat$add_info <- dds$add_info

	jpeg(output_files[4], width=800, height=800)
	generalizedPCAPlot <- ggplot(gpca.dat, aes(x=dim1, y=dim2, color=condition, shape=add_info,
											   label=rownames(gpca.dat))) +
	  	geom_point(size=2) +
		geom_text(check_overlap=TRUE, hjust=0.5,vjust=1) +
		coord_fixed() +
		ggtitle("glmpca - Generalized PCA of samples")
	print(generalizedPCAPlot)
	dev.off()
}


# ----------------- Main function -----------------
main_function <- function(){
	threads <- snakemake@threads[[1]]
	register(MulticoreParam(workers=threads))

	# Snakemake variables
	deseq_dataset_obj <- snakemake@input[["deseq_dataset_r_obj"]]
	output_file_paths <- snakemake@params[["output_file_paths"]]

    # Load deseq dataset object
    dds <- readRDS(deseq_dataset_obj)

	# Run explorative analysis
	run_deseq2_explorative_analysis(dds, output_file_paths)
}


# ----------------- Run main function -----------------
# Script entry point: invoked for its side effects when run under Snakemake.
main_function()
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
library("BiocParallel")

# Load data
library("tximeta")	# Import transcript quantification data from Salmon
library("tximport")	# Import transcript-level quantification data from Kaleidoscope, Sailfish, Salmon, Kallisto, RSEM, featureCounts, and HTSeq
library("rhdf5")
library("SummarizedExperiment")

library("magrittr")	# Pipe operator
library("DESeq2")		# Differential gene expression analysis

# Plotting libraries
library("pheatmap")
library("RColorBrewer")
library("ggplot2")

# Mixed
library("PoiClaClu")
library("glmpca")
library("apeglm")
library("ashr")
library("genefilter")
library("AnnotationDbi")
library("org.Hs.eg.db")

library("ReportingTools")	# For creating HTML reports



# ---------------- Helper functions ----------------
savely_create_deseq2_object <- function(dds) {
	### Function to create a DESeq2 object -> Handles errors that can appear due to parallelization
	### Input: dds object (DESeq dataset object)
	### Output: list with "dds" (the fitted DESeq2 object) and "run_parallel"
	###         (logical flag recording whether the parallel fit succeeded,
	###         so downstream results() calls can use the same mode)

	# ------------- 5. Run the differential expression analysis ---------------
	# The respective steps of this function are printed out
	# 1. Estimation of size factors: Controlling for differences
	# in the sequencing depth of the samples
	# 2. Estimation of dispersion values for each gene & fitting a generalized
	# linear model
	print("Creating DESeq2 object")

	# Try to create the DESeq2 object with results in Parallel.
	# Both closures capture `dds` from the enclosing frame; the inner
	# assignment shadows it locally and does not modify the outer object.
	create_obj_parallelized <- function(){
		print("Creating DESeq2 object in parallel")
		dds <- DESeq2::DESeq(dds, parallel=TRUE)
		return(list("dds"=dds, "run_parallel"=TRUE))
	}
	# Serial fallback, used as the tryCatch error handler (receives the
	# condition object as `error` and logs it before refitting).
	create_obj_not_parallelized <- function(error){
		print("Error in parallelized DESeq2 object creation"); print(error)
		print("Creating DESeq2 object not in parallel")
		dds <- DESeq2::DESeq(dds, parallel=FALSE)
		return(list("dds"=dds, "run_parallel"=FALSE))
	}

	# Attempt parallel fit first; on any error fall back to the serial fit.
	result_list <- tryCatch(create_obj_parallelized(), error=create_obj_not_parallelized)
	print("DESeq2 object created!")
	return(result_list)
}

rename_rownames_with_ensembl_id_matching <- function(dds_object, input_algorithm) {
  # Extracts Ensembl IDs from the rownames of a SummarizedExperiment-like
  # object and replaces the rownames with the 15-character Ensembl IDs
  # (version suffixes such as ".5" are stripped).
  #
  # @param dds_object       any object with settable rownames (e.g. DESeqDataSet,
  #                         matrix)
  # @param input_algorithm  "salmon": the ID is the leading token of the rowname;
  #                         "kallisto": the ID is the 2nd "|"-separated token
  # @return dds_object with rewritten rownames
  print("Gene annotations")
  if (input_algorithm == "salmon") {
    # Ensembl IDs at first place: take the first 15 characters
    gene_ids_in_rows <- substr(rownames(dds_object), 1, 15)
  } else if (input_algorithm == "kallisto") {
    # Ensembl IDs at second place (delimiter: "|").
    # BUGFIX: the previous sapply(...)[2, ] relied on sapply simplifying to a
    # matrix, which fails when rownames split into differing numbers of
    # fields; vapply is type-stable and handles ragged splits.
    gene_ids_in_rows <- vapply(
      strsplit(rownames(dds_object), "\\|"),
      function(parts) substr(parts[2], 1, 15),
      character(1),
      USE.NAMES=FALSE
    )
  } else {
    stop("Unknown algorithm used for quantification")
  }

  # Set new rownames
  rownames(dds_object) <- gene_ids_in_rows
  return(dds_object)
}


add_gene_symbol_and_entrez_id_to_results <- function(result_object, with_entrez_id=FALSE) {
	# Annotates a results object whose rownames are ENSEMBL gene IDs with gene
	# symbols and, optionally, ENTREZ IDs via org.Hs.eg.db.
	#
	# @param result_object   object with ENSEMBL rownames that supports `$<-`
	#                        column assignment (e.g. DESeqResults / DataFrame).
	#                        NOTE(review): `$<-` errors on a plain matrix —
	#                        verify call sites that pass assay matrices.
	# @param with_entrez_id  if TRUE, also add an "entrez" column.
	# @return result_object with a "symbol" column (and optionally "entrez");
	#         unmatched keys become NA (multiVals="first" keeps one mapping).
	"
	Adds gene symbols and entrez-IDs to results object.
	"
	gene_ids_in_rows <- rownames(result_object)

	# Add gene symbols
	# Something breaks here when setting a new column name
	result_object$symbol <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
												 keys=gene_ids_in_rows,
												 column="SYMBOL",
												 keytype="ENSEMBL",
												 multiVals="first")
	if (with_entrez_id) {
		# Add ENTREZ-ID
		result_object$entrez <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
													 keys=gene_ids_in_rows,
													 column="ENTREZID",
													 keytype="ENSEMBL",
													 multiVals="first")
	}

	return(result_object)
}


# ---------------- DESeq2 analysis ----------------
explore_deseq2_results <- function(dds, false_discovery_rate, output_file_paths, run_parallel=FALSE,
                                   used_algorithm) {
	# Extracts and visualizes DESeq2 results: summary text, MA plots (raw and
	# ashr-shrunken), p-value histograms, a top-variance gene heatmap, and a
	# gene-annotated CSV of all results sorted by p-value.
	#
	# @param dds                   fitted DESeq2 object (output of DESeq())
	# @param false_discovery_rate  alpha passed to results() for the summary
	# @param output_file_paths     character vector of >= 7 paths, positional:
	#                              [1] summary txt, [2] MA plot (no shrink),
	#                              [3] MA plot (ashr), [4] p-value histogram,
	#                              [5] adjusted-p histogram, [6] gene heatmap,
	#                              [7] results CSV
	# @param run_parallel          forwarded to results()/lfcShrink()
	# @param used_algorithm        NOTE(review): currently unused in this body.
	#
	# Results: Metadata
	# 1. baseMean: Average/Mean of the normalized count values divided by size factors, taken over ALL samples
	# 2. log2FoldChange: Effect size estimate. Change of gene's expression
	# 3. lfcSE: Standard Error estimate for log2FoldChange
	# 4. Wald statistic results
	# 5. Wald test p-value ->  p value indicates the probability that a fold change as strong as the observed one, or even stronger, would be seen under the situation described by the null hypothesis.
	# 6. BH adjusted p-value
	print("Creating DESeq2 results object")
	results_obj <- results(dds, alpha=false_discovery_rate, parallel=run_parallel)
	capture.output(summary(results_obj), file=output_file_paths[1])


	# ------------------ 6. Plotting results --------------------
	# Contrast usage
	# TODO: Failed... -> Remove
	# print("Plotting results")
	# chosen_contrast <- tail(resultsNames(results_obj), n=1)	     # get the last contrast: Comparison of states
	# print("resultsNames(results_obj)"); print(resultsNames(results_obj))
	# print("chosen_contrast"); print(chosen_contrast)

	# ------------ 6.1 MA plot without shrinking --------------
	# - M: minus <=> ratio of log-values -> log-Fold-change on Y-axis
	# - A: average -> Mean of normalized counts on X-axis

	# res.noshr <- results(dds, contrast=chosen_contrast, parallel=run_parallel)
	res.no_shrink <- results(dds, parallel=run_parallel)
	jpeg(output_file_paths[2], width=800, height=800)
	DESeq2::plotMA(res.no_shrink, ylim = c(-5, 5), main="MA plot without shrinkage")
	dev.off()

	# ------------ 6.2 MA plot with apeGLM shrinking --------------
	# apeglm method for shrinking coefficients
	# -> which is good for shrinking the noisy LFC estimates while
	# giving low bias LFC estimates for true large differences

	# TODO: apeglm requires coefficients. However, resultsNames(results_obj) does not return any coefficients...
	# res <- lfcShrink(dds, coef=chosen_contrast, type="apeglm", parallel=run_parallel)      # Pass contrast and shrink results
	# Use ashr as shrinkage method (works from an existing results object
	# instead of a model coefficient)
	res <- DESeq2::lfcShrink(dds, res=res.no_shrink, type="ashr", parallel=run_parallel)
	jpeg(output_file_paths[3], width=800, height=800)
	DESeq2::plotMA(res, ylim = c(-5, 5), main="MA plot with ashr shrinkage")
	dev.off()

	# ------------ 6.3 Plot distribution of p-values in histogram --------------
	# Rows with baseMean <= 1 are excluded to avoid the spike from all-zero genes.
	jpeg(output_file_paths[4], width=800, height=800)
	hist(res$pvalue[res$baseMean > 1], breaks = 0:20/20, col = "grey50", border = "white",
		main="Histogram of distribution of p-values (non-adjusted)", xlab="p-value", ylab="Frequency")
	dev.off()

	jpeg(output_file_paths[5], width=800, height=800)
	hist(res$padj[res$baseMean > 1], breaks = 0:20/20, col = "grey50", border = "white",
		main="Histogram of distribution of p-values (adjusted)", xlab="p-value", ylab="Frequency")
	dev.off()

	# ------------- 6.4 Gene clustering -----------------
	print("Plotting results: Gene clustering")
	# Gene clustering -> Heatmap of divergence of gene's expression in comparison to average over all samples
	# Transform count results to reduce noise for low expression genes
	transformed_dds <- DESeq2::rlog(dds, blind=FALSE)

	# Get top Genes -> with most variance in VSD-values/rlog-transformed counts
	topVarGenes <- head(order(genefilter::rowVars(SummarizedExperiment::assay(transformed_dds)), decreasing=TRUE), 20)
	mat  <- SummarizedExperiment::assay(transformed_dds)[ topVarGenes, ]
	mat  <- mat - rowMeans(mat)   # difference to mean expression
	# Transform row names to gene symbols
	# NOTE(review): mat is a plain matrix; add_gene_symbol_and_entrez_id_to_results
	# assigns via `$<-`, which errors on matrices — confirm this call site works.
	rownames(mat) <- add_gene_symbol_and_entrez_id_to_results(mat)$symbol
	# Additional annotations
	anno <- as.data.frame(SummarizedExperiment::colData(transformed_dds)[, c("condition", "add_info")])
	# Create plot
	jpeg(output_file_paths[6], width=800, height=800)
	pheatmap::pheatmap(mat, annotation_col=anno,
			 main="Divergence in gene expression in comparison to average over all samples")
	dev.off()

	# ---------- 7. Gene annotations --------------
	res <- add_gene_symbol_and_entrez_id_to_results(res)
	resOrdered <- res[order(res$pvalue),]					# Sort results by p-value
	# Exporting results
	resOrderedDF <- as.data.frame(resOrdered)
	write.csv(resOrderedDF, file=output_file_paths[7])
}



# ----------------- Main function -----------------
main_function <- function(){
	threads <- snakemake@threads[[1]]
	register(MulticoreParam(workers=threads))

	# Snakemake variables
	deseq_dataset_obj <- snakemake@input[["deseq_dataset_r_obj"]]
	output_file_paths <- snakemake@params[["output_file_paths"]]
	# For gene-ID matching: Used in rename_rownames_with_ensembl_id_matching()
	used_algorithm <- snakemake@params["used_algorithm"]
	# Adjusted p-value threshold
	false_discovery_rate <- 0.05

    # Load deseq dataset object
    dds <- readRDS(deseq_dataset_obj)

	# Create DESeq2 results object
	print("Creating DESeq2 results object")
	result_list <- savely_create_deseq2_object(dds)
	deseq2_obj <- result_list$dds
	run_parallel <- result_list$run_parallel

	# Rename rows
	deseq2_obj <- rename_rownames_with_ensembl_id_matching(deseq2_obj, used_algorithm)

	# Run statistical analysis
	print("Running statistical analysis")
	explore_deseq2_results(deseq2_obj, false_discovery_rate, output_file_paths, run_parallel=run_parallel,
	                       used_algorithm=used_algorithm)
}


# ----------------- Run main function -----------------
# Script entry point: invoked for its side effects when run under Snakemake.
main_function()
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
library("DESeq2")
library("AnnotationDbi")
library("org.Hs.eg.db")
library("clusterProfiler")
library("ggplot2")
library("enrichplot")


create_gsea_plots <- function(gsea_obj, dotplot_file_path, gsea_plot_file_path_1, gsea_plot_file_path_2, method) {
  # Render the standard set of visualisations for one GSEA result object:
  # a dotplot of the 30 best categories plus two gseaplot2 panels covering
  # the top 10 gene sets (1-5 and 6-10).

  # Local helper: render one plot object into a fixed-size jpeg file.
  # (plot_obj is a lazily evaluated argument, so it is built only after the
  # device has been opened — same ordering as opening jpeg() first.)
  write_jpeg <- function(file_path, plot_obj) {
    jpeg(file_path, width=800, height=800)
    print(plot_obj)
    dev.off()
  }

  # 1. Dotplot of the 30 best-scoring categories
  write_jpeg(
    dotplot_file_path,
    clusterProfiler::dotplot(gsea_obj, showCategory=30)
      + ggplot2::ggtitle(paste0("DotPlot for GSE-analysis (top 30 results) with method: ", method))
  )

  # 2. Running-score plots for the top 10 gene sets, split over two panels
  write_jpeg(
    gsea_plot_file_path_1,
    enrichplot::gseaplot2(gsea_obj, geneSetID = 1:5, pvalue_table=FALSE,
                          title = paste0("GSEA-Plot for top 1-5 results with method: ", method))
  )
  write_jpeg(
    gsea_plot_file_path_2,
    enrichplot::gseaplot2(gsea_obj, geneSetID = 6:10, pvalue_table=FALSE,
                          title = paste0("GSEA-Plot for top 6-10 results with method: ", method))
  )
}

explore_gsea_go <- function(ordered_gene_list, summary_file_path, gsea_obj_file_path) {
  # GO gene-set enrichment analysis on the Biological Process ontology.
  #
  # @param ordered_gene_list  named numeric vector (ENSEMBL keys), ordered by
  #                           the ranking statistic
  # @param summary_file_path  CSV destination for the p.adjust-sorted table
  # @param gsea_obj_file_path RDS destination for the gseaResult object
  # @return the gseaResult object

  # GO comprises three orthogonal ontologies: molecular function (MF),
  # biological process (BP), and cellular component (CC); BP is used here.
  go_gsea <- clusterProfiler::gseGO(ordered_gene_list,
               ont = "BP",
               keyType = "ENSEMBL",
               OrgDb = "org.Hs.eg.db",
               verbose = TRUE)

  # Persist a summary table sorted by adjusted p-value ...
  gsea_table <- as.data.frame(go_gsea)
  write.csv(gsea_table[order(gsea_table$p.adjust), ], file=summary_file_path)

  # ... and the raw result object for downstream plotting
  saveRDS(go_gsea, file=gsea_obj_file_path)

  return(go_gsea)
}


explore_gsea_kegg <- function(ordered_gene_list, summary_file_path, gsea_obj_file_path) {
  # KEGG pathway gene-set enrichment analysis.
  #
  # @param ordered_gene_list  named numeric vector (ENSEMBL keys), ordered by
  #                           the ranking statistic
  # @param summary_file_path  CSV destination for the p.adjust-sorted table
  # @param gsea_obj_file_path RDS destination for the gseaResult object
  # @return the gseaResult object

  # gseKEGG expects ENTREZ gene IDs, so remap the vector's ENSEMBL names
  # first (unmatched keys become NA; multiVals="first" keeps one mapping).
  names(ordered_gene_list) <- mapIds(org.Hs.eg.db, keys=names(ordered_gene_list), column="ENTREZID",
                                                      keytype="ENSEMBL", multiVals="first")

  kegg_gsea <- clusterProfiler::gseKEGG(geneList=ordered_gene_list,
                                        organism='hsa',
                                        verbose=TRUE)

  # Persist a summary table sorted by adjusted p-value ...
  kegg_table <- as.data.frame(kegg_gsea)
  write.csv(kegg_table[order(kegg_table$p.adjust), ], file=summary_file_path)

  # ... and the raw result object for downstream plotting
  saveRDS(kegg_gsea, file=gsea_obj_file_path)
  return(kegg_gsea)
}


explore_gsea_wp <- function(ordered_gene_list,
                            summary_file_path, gsea_obj_file_path) {
  # WikiPathways gene-set enrichment analysis.
  #
  # @param ordered_gene_list  named numeric vector (ENSEMBL keys), ordered by
  #                           the ranking statistic
  # @param summary_file_path  CSV destination for the p.adjust-sorted table
  # @param gsea_obj_file_path RDS destination for the gseaResult object
  # @return the gseaResult object

  # gseWP expects ENTREZ gene IDs, so remap the vector's ENSEMBL names first
  # (unmatched keys become NA; multiVals="first" keeps one mapping).
  names(ordered_gene_list) <- mapIds(org.Hs.eg.db, keys=names(ordered_gene_list), column="ENTREZID",
                                                      keytype="ENSEMBL", multiVals="first")

  wp_gsea <- clusterProfiler::gseWP(
    geneList=ordered_gene_list,
    organism="Homo sapiens",
    verbose=TRUE)

  # Persist a summary table sorted by adjusted p-value ...
  wp_table <- as.data.frame(wp_gsea)
  write.csv(wp_table[order(wp_table$p.adjust), ], file=summary_file_path)

  # ... and the raw result object for downstream plotting
  saveRDS(wp_gsea, file=gsea_obj_file_path)
  return(wp_gsea)
}


main <- function() {
  # GSEA driver script: fits DESeq2 on the saved dataset, builds a ranked
  # gene list from the Wald statistic, and runs GO / KEGG / WikiPathways
  # gene-set enrichment, writing tables, RDS objects and plots.

  # Input
  input_dseq_dataset_obj <- snakemake@input$deseq_dataset_obj

  # Outputs
  gsea_go_obj_file_path <- snakemake@output$gsea_go_obj_file_path
  gsea_go_summary_file_path <- snakemake@output$gsea_go_summary_file_path
  gsea_kegg_obj_file_path <- snakemake@output$gsea_kegg_obj_file_path
  gsea_kegg_summary_file_path <- snakemake@output$gsea_kegg_summary_file_path
  gsea_wp_obj_file_path <- snakemake@output$gsea_wp_obj_file_path
  gsea_wp_summary_file_path <- snakemake@output$gsea_wp_summary_file_path

  # DotPlots
  dotplot_gsea_go_file_path <- snakemake@output$dotplot_gsea_go_file_path
  dotplot_gsea_kegg_file_path <- snakemake@output$dotplot_gsea_kegg_file_path
  dotplot_gsea_wp_file_path <- snakemake@output$dotplot_gsea_wp_file_path
  # GSEA Plots
  gsea_go_top10_plot_file_path_1 <- snakemake@output$gsea_go_top10_plot_file_path_1
  gsea_kegg_top10_plot_file_path_1 <- snakemake@output$gsea_kegg_top10_plot_file_path_1
  gsea_wp_top10_plot_file_path_1 <- snakemake@output$gsea_wp_top10_plot_file_path_1
  gsea_go_top10_plot_file_path_2 <- snakemake@output$gsea_go_top10_plot_file_path_2
  gsea_kegg_top10_plot_file_path_2 <- snakemake@output$gsea_kegg_top10_plot_file_path_2
  gsea_wp_top10_plot_file_path_2 <- snakemake@output$gsea_wp_top10_plot_file_path_2

  # Params
  input_algorithm <- snakemake@params$input_algorithm

  # Load DataSet
  dds <- readRDS(input_dseq_dataset_obj)
  # Create DESeq2 object (serial fit; no parallel fallback needed here)
  dds <- DESeq(dds)
  res <- DESeq2::results(dds)

  # Filtering: drop NA rows, then genes with low mean expression
  res <- na.omit(res)
  res <- res[res$baseMean >50,] # Filter out genes with low expression

  # Order output -> We choose stat, which takes log-Fold as well as SE into account
  # Alternative: lfc * -log10(P-value)
  # order descending so use minus sign
  res <- res[order(-res$stat),]

  # --------- Create input gene list ---------------
  # Extract stat values
  gene_list <- res$stat
  # Add rownames (15-char Ensembl IDs, version suffix stripped)
  if (input_algorithm == "salmon") {
    # Ensembl-Transcript-IDs at first place
    names(gene_list) <- substr(rownames(res), 1, 15)
  }
  else if (input_algorithm == "kallisto") {
    # Ensembl-Transcript-IDs at second place (delimeter: "|")
    # NOTE(review): [2, ] assumes sapply simplifies to a matrix, i.e. every
    # rowname splits into the same number of "|"-fields — verify upstream.
    gene_ids_in_rows <- sapply(rownames(res), function(x) strsplit(x, '\\|')[[1]], USE.NAMES=FALSE)[2,]
    gene_ids_in_rows <- sapply(gene_ids_in_rows, function(x) substr(x, 1, 15), USE.NAMES=FALSE)
    names(gene_list) <- gene_ids_in_rows
  }
  else {
    stop("Unknown algorithm used for quantification")
  }

  # =========== Run  GSEA ===========
  # ----- 1. GO Enrichment --------
  go_gsea_obj <- explore_gsea_go(gene_list, gsea_go_summary_file_path, gsea_go_obj_file_path)
  create_gsea_plots(go_gsea_obj, dotplot_gsea_go_file_path,
                    gsea_go_top10_plot_file_path_1, gsea_go_top10_plot_file_path_2, "go")

  # ----- 2. KEGG Enrichment --------
  kegg_gsea_obj <- explore_gsea_kegg(gene_list, gsea_kegg_summary_file_path, gsea_kegg_obj_file_path)
  create_gsea_plots(kegg_gsea_obj, dotplot_gsea_kegg_file_path,
                    gsea_kegg_top10_plot_file_path_1, gsea_kegg_top10_plot_file_path_2, "kegg")

  # ----- 3. WikiPathway Enrichment --------
  wp_gsea_obj <- explore_gsea_wp(gene_list, gsea_wp_summary_file_path, gsea_wp_obj_file_path)
  create_gsea_plots(wp_gsea_obj, dotplot_gsea_wp_file_path,
                    gsea_wp_top10_plot_file_path_1, gsea_wp_top10_plot_file_path_2, "wp")
}

# Run main function
# Script entry point: invoked for its side effects when run under Snakemake.
main()
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
library("FRASER")

library("TxDb.Hsapiens.UCSC.hg19.knownGene")
library("org.Hs.eg.db")

# Requierements: 1. Sample annotation,
# 2. Two count matrices are needed: one containing counts for the splice junctions, i.e. the
# split read counts, and one containing the splice site counts, i.e. the counts of non
# split reads overlapping with the splice sites present in the splice junctions.


set_up_fraser_dataset_object <- function(sample_annotation_file_path) {
  #' Function to set up a FRASER object from a sample annotation table
  #'
  #' @param sample_annotation_file_path     Path to tab-separated sample
  #'   annotation file; must contain a "bamFile" column (FRASER requirement)
  #'
  #' @return FRASER object with split/non-split read counts

  # Load annotation file
  # NOTE(review): fread() comes from data.table, which is not attached in
  # this script's visible library() block — confirm it is loaded elsewhere.
  annotationTable <- fread(sample_annotation_file_path, header=TRUE, sep="\t", stringsAsFactors=FALSE)
  annotationTable$bamFile <- file.path(annotationTable$bamFile)   # Required for FRASER

  # --------------- Creating a FRASER object ----------------
  # create FRASER object
  settings <- FraserDataSet(colData=annotationTable, name="Fraser Dataset")

  # Via count reads (counts split and non-split reads from the BAM files)
  fds <- countRNAData(settings)

  # Via raw counts (alternative input path, currently disabled)
  # junctionCts <- fread(additional_junction_counts_file, header=TRUE, sep="\t", stringsAsFactors=FALSE)
  # spliceSiteCts <- fread(additional_splice_site_counts_file, header=TRUE, sep="\t", stringsAsFactors=FALSE)
  # fds <- FraserDataSet(colData=annotationTable, junctions=junctionCts, spliceSites=spliceSiteCts, workingDir="FRASER_output")

  return(fds)
}


run_filtering <- function(fraser_object,
                          plot_filter_expression_file, plot_cor_psi5_heatmap_file,
                          plot_cor_psi3_heatmap_file, plot_cor_theta_heatmap_file) {
  #' Compute PSI values, filter lowly-expressed junctions and draw
  #' sample-correlation heatmaps for psi5, psi3 and theta.
  #'
  #' @param fraser_object                 FRASER object
  #' @param plot_filter_expression_file   jpeg path for the expression-filter plot
  #' @param plot_cor_psi5_heatmap_file    jpeg path for the psi5 correlation heatmap
  #' @param plot_cor_psi3_heatmap_file    jpeg path for the psi3 correlation heatmap
  #' @param plot_cor_theta_heatmap_file   jpeg path for the theta correlation heatmap
  #'
  #' @return Filtered FRASER object (junctions that passed all filters)

  # Local helper: draw one sample-correlation heatmap into a jpeg file.
  # BUGFIX: the graphics device is now closed via on.exit() even when the
  # plotting call errors; previously an error inside tryCatch left the jpeg
  # device open, corrupting all subsequent plot files.
  plot_cor_heatmap_safely <- function(fds_obj, file_path, psi_type) {
    jpeg(file_path, width=800, height=800)
    on.exit(dev.off(), add=TRUE)
    tryCatch(
      expr = {
        plotCountCorHeatmap(fds_obj, type=psi_type, logit=TRUE, normalized=FALSE)
      },
      error = function(e) {
        print("Error in creating Heatmap of the sample correlation")
        print(e)
      }
    )
  }

  # --------------- Filtering ----------------
  # Compute main splicing metric -> The PSI-value
  fds <- calculatePSIValues(fraser_object)
  # Run filters on junctions: At least one sample has 20 reads.
  # filter=FALSE, since we first plot and subsequently apply subsetting.
  fds <- filterExpressionAndVariability(fds,
                                        minExpressionInOneSample=20,
                                        minDeltaPsi=0.0,  # Only junctions with a PSI-value difference of at least x% between two samples are considered
                                        filter=FALSE       # If TRUE, a subsetted fds containing only the introns that passed all filters is returned.
                                        )

  # Plot filtering results
  jpeg(plot_filter_expression_file, width=800, height=800)
  print(plotFilterExpression(fds, bins=100))
  dev.off()

  # Finally apply filter results: keep only junctions flagged as "passed"
  fds_filtered <- fds[mcols(fds, type="j")[,"passed"],]

  # ---------------- Heatmaps of correlations ----------------
  # (Per-junction "junctionSample" top-100 variants were previously commented
  # out and have been removed.)
  # 1. Correlation of PSI5
  plot_cor_heatmap_safely(fds_filtered, plot_cor_psi5_heatmap_file, "psi5")
  # 2. Correlation of PSI3
  plot_cor_heatmap_safely(fds_filtered, plot_cor_psi3_heatmap_file, "psi3")
  # 3. Correlation of Theta
  plot_cor_heatmap_safely(fds_filtered, plot_cor_theta_heatmap_file, "theta")

  return(fds_filtered)
}


detect_dif_splice <- function(fraser_object, output_fraser_analysis_set_object_file,
                              plot_normalized_cor_psi5_heatmap_file,
                              plot_normalized_cor_psi3_heatmap_file,
                              plot_normalized_cor_theta_heatmap_file) {
  #' Fit the FRASER model and run the differential splicing analysis.
  #'
  #' @param fraser_object   Filtered FRASER data set (output of the filtering step)
  #' @param output_fraser_analysis_set_object_file   Path whose grandparent
  #'        directory and basename are passed to saveFraserDataSet()
  #' @param plot_normalized_cor_psi5_heatmap_file    jpeg path, psi5 heatmap
  #' @param plot_normalized_cor_psi3_heatmap_file    jpeg path, psi3 heatmap
  #' @param plot_normalized_cor_theta_heatmap_file   jpeg path, theta heatmap
  #'
  #' @return FRASER object after fitting and gene annotation

  # Helper: render one normalized correlation heatmap into a jpeg file.
  # on.exit() guarantees the graphics device is closed even when plotting
  # fails; the previous version skipped dev.off() in the error handler,
  # leaving a dangling device that broke every later plot.
  .plot_norm_heatmap <- function(fds_obj, psi_type, out_file) {
    tryCatch(
      expr = {
        jpeg(out_file, width=800, height=800)
        on.exit(dev.off(), add=TRUE)
        plotCountCorHeatmap(fds_obj, type=psi_type, normalized=TRUE, logit=TRUE)
      },
      error = function(e) {
        print("Error in creating Heatmap of the sample correlation")
        print(e)
      }
    )
  }

  # ----------------- Detection of differential splicing -----------------
  # 1. Fitting the splicing model:
  # Normalize data and correct for confounding effects with a denoising
  # autoencoder. This is computationally heavy on real-size datasets.
  #
  # q: the encoding dimension used during fitting; can be tuned with
  # optimHyperParams, see: https://rdrr.io/bioc/FRASER/man/optimHyperParams.html
  fds <- FRASER(fraser_object, q=c(psi5=3, psi3=5, theta=2))

  # Sanity-check the fit: sample correlation heatmaps on normalized counts
  .plot_norm_heatmap(fds, "psi5", plot_normalized_cor_psi5_heatmap_file)
  .plot_norm_heatmap(fds, "psi3", plot_normalized_cor_psi3_heatmap_file)
  .plot_norm_heatmap(fds, "theta", plot_normalized_cor_theta_heatmap_file)

  # 2. Differential splicing analysis
  # 2.1 annotate introns with the HGNC symbols of the corresponding gene
  txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
  orgDb <- org.Hs.eg.db
  fds <- annotateRangesWithTxDb(fds, txdb=txdb, orgDb=orgDb)

  # 2.2 persist the fitted object (saveFraserDataSet writes an RDS file
  # into a savedObjects folder under `dir`, named by `name`)
  print("Saving FraserAnalysisDataSetTest results")
  saveFraserDataSet(fds, dir=dirname(dirname(output_fraser_analysis_set_object_file)),
                    name=basename(output_fraser_analysis_set_object_file))

  return(fds)
}


main_function <- function() {
  # Orchestrate the FRASER workflow: (1) build the dataset object from the
  # sample annotation, (2) filter it, (3) fit the model and detect
  # differential splicing. All paths come from the snakemake S4 object.
  smk_input <- snakemake@input
  smk_output <- snakemake@output

  sample_annotation_file <- smk_input[["sample_annotation_file"]]

  # Plot outputs produced after filtering, before normalization
  filter_expr_plot <- smk_output[["plot_filter_expression_file"]]
  cor_psi5_plot <- smk_output[["plot_cor_psi5_heatmap_file"]]
  cor_psi3_plot <- smk_output[["plot_cor_psi3_heatmap_file"]]
  cor_theta_plot <- smk_output[["plot_cor_theta_heatmap_file"]]

  # NOTE(review): the "top100" junctionSample heatmap outputs were judged
  # unhelpful and are intentionally not read here.

  # Plot outputs produced after normalization
  norm_cor_psi5_plot <- smk_output[["plot_normalized_cor_psi5_heatmap_file"]]
  norm_cor_psi3_plot <- smk_output[["plot_normalized_cor_psi3_heatmap_file"]]
  norm_cor_theta_plot <- smk_output[["plot_normalized_cor_theta_heatmap_file"]]

  # Output path for the differential splicing analysis object
  fraser_dataset_file <- smk_output[["fraser_data_set_object_file"]]

  # TODO: integrating additional external count files failed previously;
  # the corresponding params are intentionally unused.

  # Size the parallel backend by the rule's thread allocation
  register(MulticoreParam(workers = snakemake@threads))

  # 1. Create FRASER object
  fraser_obj <- set_up_fraser_dataset_object(sample_annotation_file)
  print("FRASER: FRASER dataset object created")

  # 2. Run filtering
  filtered_fraser_obj <- run_filtering(fraser_obj,
                                       filter_expr_plot,
                                       cor_psi5_plot,
                                       cor_psi3_plot,
                                       cor_theta_plot)
  print("FRASER: Filtering done")

  # 3. Detect differential splicing
  detect_dif_splice(filtered_fraser_obj, fraser_dataset_file,
                    norm_cor_psi5_plot,
                    norm_cor_psi3_plot,
                    norm_cor_theta_plot)
  print("FRASER: Differential splicing analysis done")
}

# Script entry point: run the full FRASER pipeline
main_function()
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
library("AnnotationDbi")
library("org.Hs.eg.db")


extract_gene_id_from_info_col <- function(data_frame_obj, info_col, gene_id_col="gene_ensembl_id") {
	#' Extract the Ensembl gene ID from a GTF-style attribute column.
	#'
	#' Each entry in `info_col` looks like:
	#'   gene_id "ENSG00000186092"; transcript_id "ENST00000335137"; ...
	#' The value between `gene_id "` and the following `";` is copied into a
	#' new column named `gene_id_col`. Entries without a `gene_id "` prefix
	#' are kept up to the first `";` (same behavior as before).
	#'
	#' @param data_frame_obj data.frame containing `info_col`
	#' @param info_col name of the attribute column to parse
	#' @param gene_id_col name of the new column receiving the gene IDs
	#' @return `data_frame_obj` with `gene_id_col` added
	# gsub() is vectorized, so the whole column is parsed in one pass
	# (no lapply over a one-column data frame needed).
	gene_ids <- gsub(pattern=".*gene_id \"", replacement="", x=data_frame_obj[[info_col]])
	gene_ids <- gsub(pattern="\";.*", replacement="", x=gene_ids)
	data_frame_obj[[gene_id_col]] <- gene_ids

	return(data_frame_obj)
}


add_gene_symbol_and_entrez_id_to_results <- function(data_frame_obj,
														gene_ensembl_id_col="gene_ensembl_id",
														gene_name_col="gene_name") {
	#' Map Ensembl gene IDs to HGNC gene symbols via org.Hs.eg.db.
	#'
	#' NOTE(review): despite the name, only gene symbols are added; no
	#' Entrez ID column is created here.
	#'
	#' @param data_frame_obj data.frame containing `gene_ensembl_id_col`
	#' @param gene_ensembl_id_col column holding Ensembl gene IDs
	#' @param gene_name_col name of the new column receiving the symbols
	#' @return `data_frame_obj` with `gene_name_col` added
	gene_ids_vector <- as.character(data_frame_obj[[gene_ensembl_id_col]])

	if (length(gene_ids_vector) == 0) {
		# No rows: add an empty symbol column so downstream column
		# selection still finds `gene_name_col`.
		data_frame_obj[gene_name_col] <- character(0)
	} else {
		# mapIds with multiVals="first" returns one symbol per key (NA
		# where no mapping exists)
		data_frame_obj[gene_name_col] <- AnnotationDbi::mapIds(org.Hs.eg.db::org.Hs.eg.db,
															  keys=gene_ids_vector,
															  column="SYMBOL",
															  keytype="ENSEMBL",
															  multiVals="first")
	}
	return(data_frame_obj)
}




# Main function
main <- function() {
	#' For each input table: parse the Ensembl gene ID out of the GTF-style
	#' info column, map it to an HGNC gene symbol, move both columns to the
	#' front, and write the annotated table to the paired output path.
	#'
	#' Input and output are parallel lists: output_files[[i]] receives the
	#' annotated version of input_table_files[[i]].
	input_table_files <- snakemake@input
	output_files <- snakemake@output

	# Column names are configurable through the rule's params
	info_col_name <- snakemake@params[["info_col_name"]]
	gene_ensembl_id_col_name <- snakemake@params[["gene_ensembl_id_col_name"]]
	gene_name_col_name <- snakemake@params[["gene_name_col_name"]]

	# Loop over input files
	for (i in seq_along(input_table_files)) {
		# Read input table
		df <- read.table(toString(input_table_files[i]), sep="\t", header=TRUE, stringsAsFactors=FALSE)

		# Extract gene ID from info column
		df <- extract_gene_id_from_info_col(df, info_col=info_col_name, gene_id_col=gene_ensembl_id_col_name)

		# Add gene symbols
		df <- add_gene_symbol_and_entrez_id_to_results(df,
			gene_ensembl_id_col=gene_ensembl_id_col_name, gene_name_col=gene_name_col_name)

		# Put gene_ensembl_id_col and gene_name_col to the front,
		# keeping the remaining columns in their original order
		output_table <- df[, c(gene_ensembl_id_col_name, gene_name_col_name,
			setdiff(colnames(df), c(gene_ensembl_id_col_name, gene_name_col_name)))]

		# Write output table
		write.table(output_table, file=toString(output_files[i]), sep="\t", quote=FALSE, row.names=FALSE)
	}
}


# Run main function (script entry point)
main()

KAS-Seq Analysis Workflow (v0.2)

  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
library(BiocManager, quietly = TRUE)
library(ChIPseeker, quietly = TRUE)
library(org.Hs.eg.db, quietly = TRUE)
library(cowplot, quietly = TRUE)
library(readr, quietly = TRUE)
library(argparser, quietly = TRUE)

# Build the command-line interface. At least one of --txdb, --txdb_file,
# or --annotation_file must be supplied (validated after parsing, below).
p <- arg_parser("KAS-Seq Peak Annotation and Comparison")
p <- add_argument(p, "--peak_files",
                  help = "Peak files to annotate and compare",
                  nargs = Inf)
p <- add_argument(p, "--txdb_file",
                  help = "File to load txdb using AnnotationDbi::loadDb()")
p <- add_argument(p, "--annotation_file",
                  help = paste0("GFF3 or GTF file of gene annotations used to ",
                                "build txdb"))
p <- add_argument(p, "--txdb",
                  help = "Name of txdb package to install from Bioconductor")

# Add optional arguments
p <- add_argument(p, "--names", help = "Sample names for each peak file",
                  nargs = Inf)
p <- add_argument(p, "--output_dir",
                  help = "Directory for output files",
                  default = "peak_annotation")
p <- add_argument(p, "--annotation_distribution_plot",
                  help = "Peak annotation distribution barplot filename",
                  default = "annotationDistributionPlot.pdf")
p <- add_argument(p, "--peak_annotation_list_rdata",
                  help = "Peak annotation list Rdata file",
                  default = "peak_anno_list.Rdata")
p <- add_argument(p, "--venn_diagram",
                  help = paste0("Venn diagram of annotated genes per sample ",
                                "pdf filename"),
                  default = "annotationVennDiagram.pdf")

# Parse arguments (interactive, snakemake, or command line).
# The `snakemake` object exists only when this script is executed from a
# Snakemake rule; interactive() covers debugging in an R session; otherwise
# arguments come from the command line.
if (exists("snakemake")) {
  # Arguments via Snakemake: translate rule inputs/params/outputs into the
  # equivalent command-line flags so one parser handles all three modes
  argv <- parse_args(p, c(
    "--peak_files", snakemake@input[["peak_files"]],
    "--txdb_file", snakemake@input[["txdb_file"]],
    "--names", snakemake@params[["names"]],
    "--output_dir", snakemake@params[["output_dir"]],
    "--annotation_distribution_plot",
    snakemake@output[["annotation_distribution_plot"]],
    "--venn_diagram", snakemake@output[["venn_diagram"]],
    "--peak_annotation_list_rdata",
    snakemake@output[["peak_annotation_list_rdata"]]
  ))
} else if (interactive()) {
  # Arguments supplied inline (for debug/testing when running interactively)
  # NOTE(review): these hard-coded paths are developer fixtures; they are
  # only reached in an interactive session
  print("Running interactively...")
  peak_files <- c("results_2020-12-03/macs2/D701-lane1_peaks.broadPeak",
                  "results_2020-12-03/macs2/D702-lane1_peaks.broadPeak",
                  "results_2020-12-03/macs2/D703-lane1_peaks.broadPeak",
                  "results_2020-12-03/macs2/D704-lane1_peaks.broadPeak",
                  "results_2020-12-03/macs2/D705-lane1_peaks.broadPeak")
  names <- c("D701", "D702", "D703", "D704", "D705")
  annotation_file <- "genomes/hg38/annotation/Homo_sapiens.GRCh38.101.gtf"
  txdb_file <- "txdb.db"
  argv <- parse_args(p, c("--peak_files", peak_files,
                          "--names", names,
                          "--txdb_file", txdb_file))
  print(argv)
} else {
  # Arguments from command line
  argv <- parse_args(p)
  print(argv)
}

# Set sample names: use --names when given, otherwise fall back to the
# basename of each peak file. basename() is vectorized, so no sapply()
# wrapper is needed.
if (!anyNA(argv$names)) {
  peak_file_names <- argv$names
} else {
  peak_file_names <- basename(argv$peak_files)
}
names(argv$peak_files) <- peak_file_names

# Create the output directory (including parents) if it does not exist
if (!dir.exists(argv$output_dir)) {
  dir.create(argv$output_dir, recursive = TRUE)
}

# Get txdb object, trying the three sources in priority order:
# Bioconductor package name > saved AnnotationDbi file > GFF/GTF file.
if (!is.na(argv$txdb)) {
  # Load (install if needed) txdb from Bioconductor
  library(pacman, quietly = TRUE)
  pacman::p_load(argv$txdb, character.only = TRUE)
  # A TxDb.* package exports a TxDb object with the same name as the
  # package; fetch it by name instead of eval(parse(text = ...))
  txdb <- get(argv$txdb)
} else if (!is.na(argv$txdb_file)) {
  # Load txdb from a previously saved AnnotationDbi database file
  library(AnnotationDbi, quietly = TRUE)
  txdb <- AnnotationDbi::loadDb(argv$txdb_file)
} else if (!is.na(argv$annotation_file)) {
  # Create txdb object from supplied annotation file
  library(GenomicFeatures, quietly = TRUE)
  txdb <- GenomicFeatures::makeTxDbFromGFF(argv$annotation_file)
} else {
  stop("Must specify one of --txdb, --txdb_file, or --annotation_file")
}


# Peak Annotation: annotate every peak file against the txdb, then write
# per-sample annotated tables and text summaries.
# TODO Provide config parameter for annoDb
peak_anno_list <- lapply(argv$peak_files, annotatePeak, TxDb = txdb,
                       tssRegion = c(-3000, 3000), annoDb = "org.Hs.eg.db",
                       verbose = FALSE)
lapply(names(peak_anno_list), function(name) {
  filebase <- file.path(argv$output_dir, basename(argv$peak_files[[name]]))
  write_tsv(as.data.frame(peak_anno_list[[name]]),
            file = paste0(filebase, ".annotated.tsv.gz"))
  # Redirect console output to the summary file; on.exit() guarantees the
  # sink is closed even if print() fails, so later console output is not
  # silently swallowed into the file.
  sink(file = paste0(filebase, ".annotated.summary.txt"))
  on.exit(sink(), add = TRUE)
  print(peak_anno_list[[name]])
})
saveRDS(peak_anno_list,
        file = argv$peak_annotation_list_rdata)

# Barplot of annotation categories (promoter, exon, intron, ...) per sample
anno_bar_plot <- plotAnnoBar(peak_anno_list)
save_plot(
  filename = argv$annotation_distribution_plot,
  plot = anno_bar_plot,
  base_width = 14,
  base_height = 14
)

# nolint start
# Functional profiles comparison
# NOTE: Not all data return enrichment
# genes = lapply(peak_anno_list, function(i) as.data.frame(i)$geneId)
# names(genes) = sub("_", "\n", names(genes))
# compKEGG <- compareCluster(geneCluster   = genes,
#                            fun           = "enrichKEGG",
#                            pvalueCutoff  = 0.05,
#                            pAdjustMethod = "BH")
# dotplot(compKEGG, showCategory = 15, title = "KEGG Pathway Enrichment Analysis")
# nolint end

# Venn Diagram of genes annotated per sample
# Note: vennplot draws to the active device and does not return a ggplot
# object, so it cannot go through save_plot().
pdf(
  file = argv$venn_diagram,
  height = 14,
  width = 14
)
genes <- lapply(peak_anno_list, function(i) as.data.frame(i)$geneId)
# finally guarantees the pdf device is closed even if vennplot() errors,
# so the output file is never left open/truncated
tryCatch(vennplot(genes), finally = dev.off())
data / bioconductor

org.Hs.eg.db

Genome wide annotation for Human, primarily based on mapping using Entrez Gene identifiers.