BioWorkflows: Workflow dervied form https://github.com/snakemake-workflows/rna-seq-star-deseq2 for multi condition Deseq2 and enrichment analyis

script:
    "../scripts/test_carnival.R"

SnakeMake From line 32 of rules/Carnival.smk

script:
    "../scripts/test_carnival.R"

SnakeMake From line 66 of rules/Carnival.smk

script:
    "../scripts/RMD_scripts/carnival_downstream.Rmd"

SnakeMake From line 91 of rules/Carnival.smk

script:
    "../scripts/RMD_scripts/carnival_join.Rmd"

SnakeMake From line 112 of rules/Carnival.smk

script:
    "../scripts/transcriptutorial/sample_resolution_dorothea.R"

SnakeMake From line 132 of rules/Carnival.smk

script:
    "../scripts/transcriptutorial/sample_resolution_progeny.R"

SnakeMake From line 150 of rules/Carnival.smk

script:
    "../scripts/transcriptutorial/sample_resolution_carnival.R"

SnakeMake From line 176 of rules/Carnival.smk

script:
    "../scripts/transcriptutorial/06_analysis_CARNIVAL_results.Rmd"

SnakeMake From line 202 of rules/Carnival.smk

script:
    "../scripts/deseq2-init.R"

SnakeMake DESeq2 From line 18 of rules/diffexp.smk

script:
    "../scripts/plot-pca.R"

SnakeMake From line 35 of rules/diffexp.smk

script:
    "../scripts/deseq2.R"

SnakeMake DESeq2 From line 65 of rules/diffexp.smk

script:
    "../scripts/rlog_transform.R"

SnakeMake From line 83 of rules/diffexp.smk

script:
    "../scripts/DESeq2_analysis.Rmd"

SnakeMake From line 113 of rules/diffexp.smk

script:
    "../scripts/RMD_scripts/gsea_report.Rmd"

SnakeMake From line 158 of rules/diffexp.smk

script:
    "../scripts/DEseq2_cohort.Rmd"

SnakeMake From line 181 of rules/diffexp.smk

script:
    "../scripts/run_mitch.R"

SnakeMake From line 209 of rules/diffexp.smk

script:
    "../scripts/export_diffexp_xlsx.R"

SnakeMake From line 235 of rules/diffexp.smk

script:
    "../scripts/dorothea.Rmd"

SnakeMake From line 255 of rules/diffexp.smk

script:
    "../scripts/RMD_scripts/dorothea_diffexp.Rmd"

SnakeMake From line 281 of rules/diffexp.smk

script:
    "../scripts/RMD_scripts/progeny_analysis.Rmd"

SnakeMake From line 306 of rules/diffexp.smk

script:
    "../scripts/count-matrix.py"

SnakeMake From line 23 of rules/DKFZ.smk

script:
    "../scripts/gtf2bed.py"

SnakeMake From line 38 of rules/DKFZ.smk

shell:
    "junction_annotation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log[0]} 2>&1"

SnakeMake From line 66 of rules/DKFZ.smk

shell:
    "junction_saturation.py {params.extra} -i {input.bam} -r {input.bed} -o {params.prefix} "
    "> {log} 2>&1"

SnakeMake From line 96 of rules/DKFZ.smk

shell:
    "bam_stat.py -i {input} > {output} 2> {log}"

SnakeMake From line 121 of rules/DKFZ.smk

shell:
    "infer_experiment.py -r {input.bed} -i {input.bam} > {output} 2> {log}"

SnakeMake From line 143 of rules/DKFZ.smk

shell:
    "inner_distance.py -r {input.bed} -i {input.bam} -o {params.prefix} > {log} 2>&1"

SnakeMake From line 171 of rules/DKFZ.smk

shell:
    "read_distribution.py -r {input.bed} -i {input.bam} > {output} 2> {log}"

SnakeMake From line 196 of rules/DKFZ.smk

shell:
    "read_duplication.py -i {input} -o {params.prefix} > {log} 2>&1"

SnakeMake From line 223 of rules/DKFZ.smk

shell:
    "read_GC.py -i {input} -o {params.prefix} > {log} 2>&1"

SnakeMake From line 250 of rules/DKFZ.smk

wrapper:
    "v1.19.0/bio/multiqc"

SnakeMake MultiQC From line 331 of rules/DKFZ.smk

script:
    "../scripts/export_fpkm.py"

SnakeMake From line 348 of rules/DKFZ.smk

import pandas as pd


def get_column(strandedness):
    if pd.isnull(strandedness) or strandedness == "none":
        return 8  # non stranded protocol
    elif strandedness == "yes":
        return 9  # 3rd column
    elif strandedness == "reverse":
        return 10  # 4th column, usually for Illumina truseq
    else:
        raise ValueError(
            (
                "'strandedness' column should be empty or have the "
                "value 'none', 'yes' or 'reverse', instead has the "
                "value {}"
            ).format(repr(strandedness))
        )


def get_fpkm_column(strandedness):
    if pd.isnull(strandedness) or strandedness == "none":
        return 11  # non stranded protocol
    elif strandedness == "yes":
        return 12  # 3rd column
    elif strandedness == "reverse":
        return 13  # 4th column, usually for Illumina truseq
    else:
        raise ValueError(
            (
                "'strandedness' column should be empty or have the "
                "value 'none', 'yes' or 'reverse', instead has the "
                "value {}"
            ).format(repr(strandedness))
        )


counts = [
    pd.read_table(
        f, index_col=0, usecols=[3, get_column(strandedness)], header=None, skiprows=1
    )
    for f, strandedness in zip(snakemake.input, snakemake.params.strand)
]

fpkm = [
    pd.read_table(
        f,
        index_col=0,
        usecols=[3, get_fpkm_column(strandedness)],
        header=None,
        skiprows=1,
    )
    for f, strandedness in zip(snakemake.input, snakemake.params.strand)
]

gene_names = pd.read_table(snakemake.input[0], index_col=0, usecols=[3, 6], header=0)

for t, sample in zip(counts, snakemake.params.samples):
    t.columns = [sample]

for t, sample in zip(fpkm, snakemake.params.samples):
    t.columns = [sample]

matrix = pd.concat(counts, axis=1)
fpkm_mat = pd.concat(fpkm, axis=1)


matrix.index.name = "gene"
fpkm_mat.index.name = "gene"
gene_names.index.name = "gene"
#  collapse technical replicates
matrix = matrix.groupby(matrix.columns, axis=1).sum()
fpkm_mat = fpkm_mat.groupby(fpkm_mat.columns, axis=1).sum()

fpkm_mat["gname"] = gene_names["name"]

matrix.to_csv(snakemake.output[0], sep="\t")
fpkm_mat.to_csv(snakemake.output[1], sep="\t")

Python Pandas From line 1 of scripts/count-matrix.py

body .main-container {
  max-width: 1800px !important;
  width: 1800px !important;
}
body {
  max-width: 1800px !important;
}

R Markdown From line 14 of scripts/DESeq2_analysis.Rmd

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE,
  fig.width = 12, fig.height = 12
)

library(tidyverse)
library(clusterProfiler)

library(DESeq2)
library(enrichplot)
library(ggupset)
library(patchwork)
library(biomaRt)
library(svglite)
if (!require("RNAscripts")) {
  devtools::install("./RNAscripts")
}
library("RNAscripts")
library("BiocParallel")
library(ComplexHeatmap)

R Markdown tidyverse BiocParallel patcHwork biomaRt ComplexHeatmap clusterProfiler svglite enrichplot ggupset From line 24 of scripts/DESeq2_analysis.Rmd

if (exists("snakemake")) {
  dds_path <- snakemake@input[["dds_obj"]]
  diffexp_tb_path <- snakemake@input[["table"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  feature_counts_fp <- snakemake@input[["featureCounts"]]
  enrich_list <- snakemake@input[["gsea_result"]]
  cond_id <- snakemake@wildcards[["condition"]]
  write(cond_id, file = stderr())
  contrast_groups <- snakemake@params[["contrast"]]
  coef_string <- paste0(
    cond_id, "_", contrast_groups[[1]], "_vs_",
    contrast_groups[[2]]
  )
  samp_map <- snakemake@params[["samples"]]
  lfc_shrink <- snakemake@config[["diffexp"]][["shrink_lfc"]]
  rld_path <- snakemake@input[["rld"]]
  register(MulticoreParam(snakemake@threads))
  lfc_threshold <- snakemake@config[["diffexp"]][["LFC_threshold"]]
  pvalue_threshold <- snakemake@config[["diffexp"]][["pval_threshold"]]
  plot_path <- snakemake@params[["plot_path"]]
  group_colors <- snakemake@config[["group_colors"]][[cond_id]] %>% unlist()
  if (!is.null(snakemake@config[["diffexp"]][["custom_model"]][[cond_id]])) {
    model_string <- snakemake@config[["diffexp"]][["custom_model"]][[cond_id]]
  } else {
    model_string <- snakemake@config[["diffexp"]][["model"]]
  }
  organism <- snakemake@config[["organism"]]
  conf <- snakemake@config
} else {
  conf <- yaml::read_yaml("configs/VascAge_config.yaml")
  base_analysis_dir <- file.path(conf$dirs$BASE_ANALYSIS_DIR)

  cond_id <- names(conf$diffexp$contrasts)[1]
  comp_id <- names(conf$diffexp$contrasts[[cond_id]])[1]
  contrast_groups <- conf$diffexp$contrasts[[cond_id]][[comp_id]]
  coef_string <- paste0(
    cond_id, "_", contrast_groups[[1]], "_vs_",
    contrast_groups[[2]]
  )
  dds_path <- file.path(paste0(base_analysis_dir), "deseq2/all.rds")
  diffexp_tb_path <- file.path(
    paste0(base_analysis_dir),
    glue::glue("results/diffexp/{cond_id}/{comp_id}.diffexp.tsv")
  )
  fpkm_path <- file.path(base_analysis_dir, "fpkm/all.tsv")
  samp_map <- file.path(conf$samples)
  rld_path <- file.path(
    paste0(base_analysis_dir),
    "deseq2/rlog_transform.RDS.gz"
  )
  register(SerialParam())
  plot_path <- "./"
  lfc_threshold <- 0.5
  pvalue_threshold <- 0.05
  enrich_list <- file.path(
    base_analysis_dir,
    glue::glue("results/diffexp/{cond_id}/{comp_id}.gseares.RDS")
  )

  group_colors <- conf[["group_colors"]][[cond_id]] %>% unlist()
  organism <- conf$organism
  lfc_shrink <- FALSE
  model_string <- "~ condition"
}

dir.create(plot_path, recursive = TRUE)


dds_obj <- readRDS(dds_path)
rld <- readRDS(rld_path)
enrich_list <- readRDS(enrich_list)

fpkm <- readr::read_tsv(fpkm_path)

R Markdown Snakemake FeatureCounts From line 48 of scripts/DESeq2_analysis.Rmd

if (!is.null(conf$diffexp$custom_model[[cond_id]])) {
  model_string <- conf$diffexp$custom_model[[cond_id]]
  # Rerun deseq since it was intialized
  for (x in names(conf[["diffexp"]][["contrasts"]])) {
    colData(dds_obj)[, x] <- as.factor(colData(dds_obj)[, x])
  }
  colData(dds_obj)[, cond_id] <- as.factor(colData(dds_obj)[, cond_id])
  design(dds_obj) <- as.formula(model_string)
  dds_obj <- DESeq(dds_obj)
  dds_obj@colData[, cond_id] <- relevel(dds_obj@colData[, cond_id],
    ref = contrast_groups[2]
  )
}
if (lfc_shrink) {
  dds_obj@colData[, cond_id] <- relevel(dds_obj@colData[, cond_id],
    ref = contrast_groups[2]
  )
  design(dds_obj) <- as.formula(model_string)
  dds_obj <- DESeq(dds_obj)
  write(coef_string, file = stderr())
  res <- lfcShrink(dds_obj,
    coef = stringr::str_replace_all(coef_string, "-", "."),
    type = "apeglm"
  )
} else {
  res <- results(dds_obj, contrast = c(cond_id, contrast_groups))
}
DESeq2::plotMA(res)

R Markdown From line 144 of scripts/DESeq2_analysis.Rmd

sample_overview <- readr::read_tsv(samp_map)

deseq_tb <- readr::read_tsv(diffexp_tb_path,
  col_names = c(
    "gene_id",
    "baseMean",
    "logFoldChange",
    "lfcSE", "stat",
    "pvalue", "padj"
  ),
  skip = 1
)

sample_overview <- sample_overview %>%
  dplyr::filter(sample %in% rownames(colData(dds_obj)))

filer <- fpkm %>% dplyr::filter(gene %in% deseq_tb$gene_id)

joined_df <- join_tables(deseq_tb, filer)

joined_df <- joined_df %>%
  dplyr::mutate(overexpressed_in = ifelse(logFoldChange > 0,
    contrast_groups[1], contrast_groups[2]
  ))

mean_tb <- joined_df %>%
  dplyr::filter(padj < pvalue_threshold &
    abs(logFoldChange) > lfc_threshold) %>%
  RNAscripts::mean_tibble_from_mat(., "logFoldChange",
    contrast_groups = contrast_groups,
    s_map = sample_overview, cond_id = cond_id
  )
DT::datatable(mean_tb)

R Markdown From line 176 of scripts/DESeq2_analysis.Rmd

diff_exp_trans <- SummarizedExperiment::assay(rld)[joined_df$gene, ]
rownames(diff_exp_trans) <- joined_df$gname
write(cond_id, file = stderr())
df_col <- data.frame(SummarizedExperiment::colData(rld)[, c(cond_id)])

if (!is.null(group_colors)) {
  col_annotation <- HeatmapAnnotation(
    condition = dds_obj@colData[, cond_id],
    col = list(condition = group_colors)
  )
} else {
  col_annotation <- HeatmapAnnotation(condition = dds_obj@colData[, cond_id])
}
rownames(df_col) <- colnames(SummarizedExperiment::assay(rld))
colnames(df_col) <- cond_id
index_vec <- which(joined_df$padj < pvalue_threshold &
  abs(joined_df$logFoldChange) > lfc_threshold)
diff_exp_genes <- diff_exp_trans[index_vec, ]
small_joined_df <- joined_df[joined_df$padj < pvalue_threshold &
  abs(joined_df$logFoldChange) > lfc_threshold, ]
small_joined_df <- na.omit(small_joined_df)

if (nrow(small_joined_df) > 1) {
  scaled_diffexp <- diff_exp_genes
  diffexp_heatmap <- Heatmap(
    head(scaled_diffexp[order(
      abs(small_joined_df$logFoldChange * -log10(small_joined_df$padj)),
      decreasing = TRUE
    ), ], 50),
    top_annotation = col_annotation
  )
  save_cheatmap_svg(
    x = diffexp_heatmap,
    filename = file.path(
      plot_path,
      "diffexp_heatmap.svg"
    )
  )
  diffexp_heatmap
}

R Markdown From line 221 of scripts/DESeq2_analysis.Rmd

library(EnhancedVolcano)

volcano_plot <- EnhancedVolcano(as.data.frame(joined_df),
  lab = joined_df$gname,
  x = "logFoldChange",
  y = "padj",
  title = paste(contrast_groups, collapse = "_vs_"),
  pCutoff = 10e-3,
  ylab = bquote(~ -Log[10] ~ italic(Padj)), FCcutoff = lfc_threshold
)
ggsave(
  filename = file.path(plot_path, "Volcano_plot.svg"),
  plot = volcano_plot
)

volcano_plot

R Markdown EnhancedVolcano From line 268 of scripts/DESeq2_analysis.Rmd

#' Uniquify a vector
#'
#' @param x Input factor to check
#' @return corrected facotr value
#' @examples
#' NULL
uniquify <- function(x) {
  if (length(x) == 1) {
    x
  } else {
    sprintf("%s_%02d", x, seq_along(x))
  }
}

#' Disambiguate a vector
#'
#' Add numbered suffixes to redundant entries in a vector
#'
#' @param in_vector Vector to be disambiguated
#' @importFrom stats ave
#' @return The disambiguated vector.
#' @export
#'
#' @examples
#' NULL
#'
disambiguate_vector <- function(in_vector) {
  ave(in_vector, in_vector, FUN = uniquify)
}
diff_exp_heatmaps <- list()
diff_norm_heatmaps <- list()


if (!is.null(group_colors)) {
  col_annotation <-
    HeatmapAnnotation(
      condition = dds_obj@colData[dds_obj@colData[, cond_id] %in%
        contrast_groups, cond_id],
      col = list(condition = group_colors)
    )
} else {
  col_annotation <- HeatmapAnnotation(
    condition = dds_obj@colData[dds_obj@colData[, cond_id] %in%
      contrast_groups, cond_id]
  )
}

filtered_join_df <-
  joined_df[] %>% dplyr::filter(padj < pvalue_threshold &
    abs(logFoldChange) > lfc_threshold)
# Since it can happen that duplicate gene names appear -> uplicates are marked

filtered_join_df$gname <- disambiguate_vector(filtered_join_df$gname)

s_table <-
  filter_split_table(rld, contrast_groups, filtered_join_df,
    reorder_genes = FALSE,
    cond_id = cond_id
  )
r_means <- rowMeans(s_table)
r_sds <- apply(s_table, 1, sd)
s_table_centered <- s_table - r_means
s_table_norm <- t(scale(t(s_table)))



diff_exp_heatmaps <- ComplexHeatmap::Heatmap(
  s_table_centered,
  cluster_rows = TRUE,
  top_annotation = col_annotation,
  name = "Centered exp.",
  row_split = filtered_join_df$overexpressed_in,
  column_names_rot = 45,
  show_row_names = nrow(s_table_norm) < 100,
)
diff_norm_heatmaps <- ComplexHeatmap::Heatmap(
  s_table_norm,
  cluster_rows = TRUE,
  top_annotation = col_annotation,
  name = "z-score exp.",
  column_names_rot = 45,
  show_row_names = nrow(s_table_norm) < 100
)

if (nrow(filtered_join_df) > 0) {
  save_cheatmap_svg(diff_exp_heatmaps,
    file.path(plot_path, "split_heatmap.svg"),
    width = 14, height = 12
  )

  save_cheatmap_svg(diff_norm_heatmaps,
    file.path("standard_redblue_heatmap.svg"),
    width = 14, height = 12
  )

  plot(diff_exp_heatmaps)
  plot(diff_norm_heatmaps)
}

R Markdown From line 297 of scripts/DESeq2_analysis.Rmd

genereate_gsea_plots <- function(gsea_obj, gsea_name) {
  cat("\n")
  cat("### ", gsea_name , " \n")
  cat("\n")


}
msig_enrichment <- enrich_list$msig_enrichment

msig_gsea <- enrich_list$msig_gsea

kegg <- enrich_list$kegg

reactome_stuff <- enrich_list$reactome_stuff

ensembl_gene_list <- joined_df %>% dplyr::select(c(gene, stat))
gene_list <- joined_df %>% dplyr::select(c(gname, stat))

R Markdown From line 410 of scripts/DESeq2_analysis.Rmd

better_dotplot <- function(gset, c_groups = contrast_groups) {
  pos_gsea <- gset %>% dplyr::filter(NES > 0.5) %>% dplyr::arrange(desc(NES))

  if (nrow(pos_gsea) > 1) {
    dp_pos_nes <-
      dotplot(
        pos_gsea,
        size = "NES",
        color = "p.adjust",
        showCategory = 20,
        title = glue::glue("Pathways enriched in {contrast_groups[1]}")
      ) +
      scale_size(range = c(1, 7), limits = c(1, max(gset@result$NES)))
  } else {
    dp_pos_nes <- NULL
  }

  neg_gsea <- gset %>% dplyr::filter(NES < -0.5) %>% dplyr::arrange(NES)

  if (nrow(neg_gsea) > 1) {
    dp_neg_nes <-
      dotplot(
        neg_gsea,
        size = "NES",
        color = "p.adjust",
        showCategory = 20,
        title = glue::glue("Pathways enriched in {contrast_groups[2]}")
      ) +
      scale_size(range = c(7, 1), limits = c(min(gset@result$NES), -1))
  } else {
    dp_neg_nes <- NULL
  }

  return(list(dp_pos_nes, dp_neg_nes))
}
#' Title
#'
#' @param d_plot
#' @param p_group
#' @param p_path
#'
#' @return
#' @export
#'
#' @examples
save_dotplots <- function(d_plot, p_group, p_path = plot_path, gsea_type) {
  ggplot2::ggsave(
    filename = file.path(
      p_path,
      glue::glue("{p_group}_{gsea_type}_dplot.svg")
    ),
    d_plot, width = 10, height = 10
  )
}

org_db <- get_org_db(organism)

do_gseaplots <- function(enrich_res, gsea_name) {
  cat("\n")
  cat("### ", gsea_name , " \n")
  cat("\n")

  id_class <- conf[["gsea"]][[gsea_name]][["id_class"]]
  all_plots <- list()
  if (!is.null(enrich_res)) {
    if (nrow(enrich_res) > 1) {
      enrich_res <- clusterProfiler::setReadable(enrich_res, org_db, id_class)
      z <- enrichplot::upsetplot(enrich_res)
      all_plots$upset <- z
      if (conf[["gsea"]][[gsea_name]][["use_gsea"]]) {
        enrich_plots <- RNAscripts::plot_enrichment(enrich_res, 
          X = "ID", Y = "NES",
          pval = "p.adjust"
        )
        ggsave(filename = file.path(plot_path, paste0(
          contrast_groups[1], "_vs_",
          contrast_groups[2],
          "_", gsea_name, "_enrichplot.svg"
        )), plot = enrich_plots)
        print(enrich_plots)
        all_plots$enrichplots <- enrich_plots

        dplot <- better_dotplot(enrich_res, c_groups = contrast_groups)
        x<- purrr::map(dplot, print)
        purrr::map2(dplot, contrast_groups, save_dotplots, gsea_type = gsea_name )
        all_plots$dotplots <- dplot      
        cnetplot <- cnetplot(enrich_res,
                                foldChange = set_names(joined_df$logFoldChange, gene_list$gname),
                                node_label = "all",
                                layout = "nicely",
                                cex_label_category = 0.8,
                                cex_label_gene = 0.6
                                )
        print(cnetplot)

        ggsave(
          filename = file.path(plot_path, paste0(
            contrast_groups[1],
            "_vs_", contrast_groups[2], "_", gsea_name,
            "_cnet.svg"
          )),
          cnetplot, width = 16, height = 16
        )
        all_plots <- cnetplot
      } else {
        enrich_plots <- NULL
        all_plots$dotplots <- dotplot(msig_enrichment) 
        print(dotplot)
      }
    } else {
      all_plots <- NULL
    }
  }
  cat("\n \n")
  return(all_plots)
}


all_plots <- purrr::map2(enrich_list, names(enrich_list), do_gseaplots)

R Markdown From line 455 of scripts/DESeq2_analysis.Rmd

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE,
  dev = "png",
  fig.width = 12, fig.height = 12
)

require(dplyr)
library(magrittr)
library(ComplexHeatmap)
library(pheatmap)

library(PCAtools)

library(clusterProfiler)

library(RNAscripts)


if (exists("snakemake")) {
  diffexp_tables_paths <- snakemake@input[["tables"]]
  contrast_groups <- snakemake@params[["contrast"]]
  contrast_list <- names(snakemake@params[["contrast"]])
  cond_id <- snakemake@wildcards[["condition"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  dds_path <- snakemake@input[["dds_obj"]]
  rld_path <- snakemake@input[["rld"]]
  write(paste0(
    class(diffexp_tables_paths), length(diffexp_tables_paths),
    " and ",
    class(contrast_list), " ", length(contrast_list)
  ), file = stderr())
  threads <- snakemake@threads
  lfc_threshold <- snakemake@config[["diffexp"]][["LFC_threshold"]]
  pvalue_threshold <- snakemake@config[["diffexp"]][["pval_threshold"]]
  group_colors <- snakemake@config[["group_colors"]][[cond_id]] %>% unlist()
  run_pert <- snakemake@config[["perform_perturbation"]]
} else {
  conf <- yaml::read_yaml("../../configs/VascAge_config.yaml")
  SARIFA_DIR <- "/Users/heyechri/Documents/software/heyer/multicondition-deseq2-enrichment"
  BASE_ANALYSIS_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR)
  cond_id <- names(conf$diffexp$contrasts)[1]
  diffexp_tables_paths <- as.list(file.path(
    BASE_ANALYSIS_DIR,
    glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.diffexp.tsv")
  ))
  contrast_list <- names(conf$diffexp$contrasts$SARIFA)

  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  dds_path <- file.path(BASE_ANALYSIS_DIR, "deseq2/all.rds")
  rld_path <- file.path(BASE_ANALYSIS_DIR, "deseq2/rlog_transform.RDS.gz")
  threads <- 2
  lfc_threshold <- 0.5
  pvalue_threshold <- 0.05
  run_pert <- FALSE

  group_colors <- conf[["group_colors"]][[cond_id]] %>% unlist()
}

R Markdown Snakemake dplyr magrittr PCAtools ComplexHeatmap clusterProfiler From line 18 of scripts/DEseq2_cohort.Rmd

## READ all DIFFexp tables
diff_exp_tables <- purrr::map(diffexp_tables_paths,
  readr::read_tsv,
  col_names = c(
    "gene_id",
    "baseMean",
    "logFoldChange",
    "lfcSE",
    "stat",
    "pvalue",
    "padj"
  ), skip = 1
)
names(diff_exp_tables) <- contrast_list
all_ensembl <- diff_exp_tables[[1]][, 1]
### Set Rownames for data frame
diff_exp_frames <- purrr::map(diff_exp_tables, function(x) {
  gene_ids <- x$gene_id
  diff_exp_fr <- as.data.frame(x[, -1])
  rownames(diff_exp_fr) <- gene_ids
  diff_exp_fr
})
### Read FPKM Table
fpkm <- readr::read_tsv(fpkm_path)
all_genes <- fpkm %>%
  dplyr::filter(gene %in% all_ensembl$gene_id) %>%
  dplyr::select(gname)

Big_tables <- purrr::map(diff_exp_tables, join_tables, fpkm = fpkm)

R Markdown From line 80 of scripts/DEseq2_cohort.Rmd

build_comb_gene_set <- function(comb_mat, comb_mat_order) {
  combination_gene_sets <- list()
  for (c_name in ComplexHeatmap::comb_name(comb_mat)) {
    gnames <- ComplexHeatmap::extract_comb(m = comb_mat, comb_name = c_name)

    groups_index <- stringr::str_split(c_name, "") %>% unlist()
    group_names <- ComplexHeatmap::set_name(comb_mat)[as.logical(as.numeric(groups_index))]
    merged_group_name <- paste(group_names, collapse = " + ")



    combination_gene_sets[[merged_group_name]] <- tibble::tibble(
      gene_names = gnames,
      lfc = rep(0, length(gnames))
    )
  }
  if (length(combination_gene_sets) > 30) {
    combination_gene_sets <- combination_gene_sets[c(match(names(comb_mat_order), ComplexHeatmap::comb_name(comb_mat))[1:30])]
  } else {
    combination_gene_sets <- combination_gene_sets
  }
  combination_gene_sets
}
group_name_combinations <- c()

gene_names <- purrr::map(Big_tables, function(x) {
  x %>% dplyr::pull(gname)
})

sig_gene_map <- purrr::map(Big_tables, function(x) {
  x %>%
    dplyr::filter(padj < pvalue_threshold &
      abs(logFoldChange) > lfc_threshold) %>%
    dplyr::select(gene, gname)
})

sig_gene_names <- purrr::map(sig_gene_map, function(x) {
  dplyr::pull(x, gname)
})
if (any(purrr::map(sig_gene_map, nrow) > 0)) {
  gene_name_set <- ComplexHeatmap::list_to_matrix(sig_gene_names)
  comb_mat <- ComplexHeatmap::make_comb_mat(gene_name_set)
  comb_mat_order <- ComplexHeatmap::comb_size(comb_mat) %>% sort(decreasing = T)
  combination_gene_sets <- build_comb_gene_set(comb_mat = comb_mat, comb_mat_order = comb_mat_order)
  H_res <- run_msig_enricher(combination_gene_sets, "H", GSEA = F)
  H_res <- H_res[!(purrr::map_lgl(H_res, is.null))]
}
if (length(sig_gene_names) > 1) {
  test <- UpSetR::upset(UpSetR::fromList(sig_gene_names),
    nsets = length(sig_gene_names),
    nintersects = 30,
    order.by = c("degree", "freq"),
    decreasing = c(T, T)
  )
} else {
  test <- NULL
}
# if (!run_pert) {
#  small_comb <- comb_mat[comb_size(comb_mat) >= 20]
#  UpSet(small_comb, comb_order = order(comb_size(small_comb),
#                                       decreasing = T), )
# }

test

R Markdown GSEA From line 122 of scripts/DEseq2_cohort.Rmd

#' Create dynamic table from significant gene names
#'
#' @param sig_gene_names import list of genes to put in matrix
#'
#' @return DT::datatable
#' @export
#'
#' @examples
create_overlap_matrix <- function(sig_gene_names) {
  gene_name_index <- unlist(sig_gene_names) %>%
    unique() %>%
    sort()

  #' Takes two character vecotrs (info and index) and checks which index fields are present in info
  #'
  #' @return returns a vecor of length(index) with 1 or 0 depending on True or False
  check_against_index <- function(info, index) {
    return(as.numeric(index %in% info))
  }

  gene_matrix <- sapply(sig_gene_names, check_against_index, index = gene_name_index)
  rownames(gene_matrix) <- gene_name_index


  gene_matrix <- cbind(gene_matrix, total = rowSums(gene_matrix))

  ordered_matrix <- gene_matrix[order(gene_matrix[, "total"], decreasing = T), ]

  DT::datatable(ordered_matrix)
}

ifelse(all(purrr::map(
  sig_gene_names,
  is.null
)),
create_overlap_matrix(sig_gene_names),
print("No enrichments present")
)

R Markdown From line 189 of scripts/DEseq2_cohort.Rmd

plot_gene_sets <- function(gset_res, plot_title) {
  print(plot_title)
  if (!is.null(gset_res)) {
    if (any(gset_res@result$p.adjust < 0.05)) {
      cnet_plot <- enrichplot::cnetplot(gset_res) + ggplot2::ggtitle(plot_title)
      return(cnet_plot)
    }
  } else {
    return(NULL)
  }
}
if (exists("H_res")) {
  cnet_plots <- purrr::pmap(list(H_res, names(H_res)), plot_gene_sets)
}

R Markdown From line 238 of scripts/DEseq2_cohort.Rmd

H_barplots <- purrr::map(H_res, barplot)
H_heatmaps <- purrr::map(H_res, clusterProfiler::heatplot, showCategory = 10)


for (set_n in names(H_res)) {
  if (any(H_res[[set_n]]@result$p.adjust < 0.05)) {
    cat("### ", set_n, "H set \n")
    y <- H_barplots[[set_n]]
    ifelse(nrow(y$data) > 0, plot(y), "")
    ifelse(nrow(H_res[[set_n]]) > 0,
      plot(dotplot(H_res[[set_n]], orderBy = "x")),
      ""
    )
    cat("\n")
    cat("\n")
    ifelse(nrow(cnet_plots[[set_n]]$data) > 5,
      plot(cnet_plots[[set_n]]), ""
    )
    cat("\n \n")
  }
}

R Markdown From line 255 of scripts/DEseq2_cohort.Rmd

dds_obj <- readRDS(dds_path)

rld <- readRDS(rld_path)

sig_transcripts <- purrr::map(sig_gene_map, function(x) {
  dplyr::pull(x, gene)
}) %>%
  unlist() %>%
  unique()

# select <- order(var(counts(dds_obj,normalized=TRUE)),
#                decreasing=TRUE)[1:400]
df <- data.frame(SummarizedExperiment::colData(dds_obj)[, c(cond_id)])
rownames(df) <- colnames(SummarizedExperiment::assay(rld))
colnames(df) <- cond_id

annotation_cols <- list(cond_id = group_colors)

ha <- ComplexHeatmap::HeatmapAnnotation(condition = rld@colData[, cond_id], col = annotation_cols)
normalized_expression <- SummarizedExperiment::assay(rld[sig_transcripts, ]) - rowMeans(SummarizedExperiment::assay(rld[sig_transcripts, ]))
ComplexHeatmap::Heatmap(normalized_expression,
  cluster_rows = TRUE, show_row_names = FALSE,
  # col = viridis::viridis(100),
  top_annotation = ha,
  name = "Normalized gene expression in differentially expressed genes",
  #  km = 4
)

R Markdown From line 288 of scripts/DEseq2_cohort.Rmd

library("RColorBrewer")
# vsd <- DESeq2::vst(dds_obj)
sampleDists <- dist(t(SummarizedExperiment::assay(rld)), )
sampleDistMatrix <- as.matrix(sampleDists)
# rownames(sampleDistMatrix) <- paste(vsd$condition)
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
ComplexHeatmap::pheatmap(sampleDistMatrix,
  clustering_distance_rows = sampleDists,
  clustering_distance_cols = sampleDists,
  col = colors
)

R Markdown RColorBrewer From line 326 of scripts/DEseq2_cohort.Rmd

assay_data <- SummarizedExperiment::assay(rld)
metadata <- SummarizedExperiment::colData(rld)

pca_data <- PCAtools::pca(assay_data, metadata = metadata, removeVar = 0.1)
scree <- screeplot(pca_data)
scree

R Markdown From line 347 of scripts/DEseq2_cohort.Rmd

PCAtools::biplot(pca_data, colby = cond_id, legendPosition = "right")
plot <- PCAtools::pairsplot(pca_data,
  components = PCAtools::getComponents(pca_data, c(1:5)),
  triangle = F,
  hline = 0, vline = 0,
  pointSize = 0.8,
  gridlines.major = FALSE, gridlines.minor = FALSE,
  colby = cond_id,
  title = "Pairs plot", plotaxes = T,
  margingaps = ggplot2::unit(c(-0.01, -0.01, -0.01, -0.01), "cm"),
  legendPosition = "none"
)
plot

R Markdown From line 362 of scripts/DEseq2_cohort.Rmd

library("DESeq2")
parallel <- FALSE
if (snakemake@threads > 1) {
  library("BiocParallel")
  # setup parallelization
  register(MulticoreParam(snakemake@threads))
  parallel <- TRUE
}
if (!exists("snakemake")) {
  library(magrittr)
  the_yaml <- yaml::read_yaml("./configs/VascAge_config.yaml")
  BASE_DIR <- the_yaml$dirs$BASE_ANALYSIS_DIR
  cts <- file.path(BASE_DIR, "counts/all.tsv") %>%
    read.table(header = T, row.names = "gene", check.names = F)
  coldata <- read.table("./data/Vasc_age2020/Vascage_samples.tsv",
    header = TRUE,
    row.names = "sample",
    check.names = FALSE,
    stringsAsFactors = TRUE
  )
  # coldata %>% dplyr::filter(cell_type == "tumor") ->coldata
  # cts <- cts[,rownames(small_coldata)]
  snakemake_conf <- yaml::read_yaml("./configs/VascAge_config.yaml")
  all_conditions <- names(snakemake_conf$diffexp$contrasts)
}
all_conditions <- names(snakemake@config$diffexp$contrasts)


# colData and countData must have the same sample order, but this is ensured
# by the way we create the count matrix
cts <- read.table(snakemake@input[["counts"]],
  header = TRUE,
  row.names = "gene",
  check.names = FALSE
)
coldata <- read.table(snakemake@params[["samples"]],
  header = TRUE,
  row.names = "sample", check.names = FALSE
)


## Reorder coldata rows to match cts col order (beacause deseq things)
if (!all(colnames(cts) == rownames(coldata))) {
  sample_ids <- colnames(cts)
  coldata <- coldata[sample_ids, ]
}
# Remove NAs from Data (not supported)
if (any(is.na(coldata[, c(all_conditions)]))) {
  na_index <- apply(coldata, 1, function(x) {
    any(is.na(x))
  })
  coldata <- coldata[!na_index, ]
  cts <- cts[, rownames(coldata)]
}
all_conditions <- names(snakemake@config$diffexp$contrasts)
for (x in all_conditions) {
  coldata[, x] <- as.factor(coldata[, x])
}

dds <- DESeqDataSetFromMatrix(
  countData = cts,
  colData = coldata,
  #  design = as.formula("~condition"))
  design = as.formula(snakemake@params[["model"]]),
)




# remove genes defined in config
excluded_genes <- snakemake@config[["excluded_genes"]]
# print(excluded_genes)
if (length(excluded_genes) > 0) {
  ex_index <- purrr::map_int(excluded_genes,
    grep, rownames(dds),
    value = F
  )
  if (length(ex_index) > 0) {
    dds <- dds[-ex_index, ]
  }
}
# remove uninformative rows
dds <- dds[rowSums(counts(dds)) > ncol(dds) / 2, ]

# normalization and preprocessing
dds <- DESeq(dds,
  parallel = parallel
)
cpm_filter <- apply(edgeR::cpm(counts(dds, normalized = T)), 1, function(x) {
  if(length(which(x > 0.5)) > 0.2 * length(x)) {
    val <- 1
  } else {
    val <- 0 
  }
  as.logical(val)
})
dds <- dds[cpm_filter,]
saveRDS(dds, file = snakemake@output[[1]])

R Snakemake DESeq2 magrittr BiocParallel From line 5 of scripts/deseq2-init.R

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("DESeq2")

parallel <- FALSE
if (snakemake@threads > 1) {
  library("BiocParallel")
  # setup parallelization
  register(MulticoreParam(snakemake@threads))
  parallel <- TRUE
}

lfc_shrink <- snakemake@config[["diffexp"]][["shrink_lfc"]]
dds <- readRDS(snakemake@input[[1]])
cond_id <- snakemake@wildcards[["condition"]]
contrast_groups <- snakemake@params[["contrast"]]
contrast <- c(cond_id, contrast_groups)
print(contrast)

coef_string <- paste0(cond_id, "_", contrast_groups[[1]], "_vs_", contrast_groups[[2]])
print(coef_string)
if (!is.null(snakemake@config$diffexp$custom_model[[contrast[1]]])) {
  # Rerun deseq since it was intialized
  for (x in names(snakemake@config[["diffexp"]][["contrasts"]])) {
    colData(dds)[, x] <- as.factor(colData(dds)[, x])
  }

  colData(dds)[, contrast[1]] <- as.factor(colData(dds)[, contrast[1]])
  design(dds) <- as.formula(snakemake@config$diffexp$custom_model[[contrast[1]]])
  dds <- DESeq(dds)
}

zero_index <- apply(counts(dds)[, colData(dds)[, cond_id] %in% contrast_groups], 1, function(x) {
  length(which(x > 0))
})
# dds <- dds[zero_index > ncol(dds[,colData(dds)[,cond_id] %in% contrast_groups])/2,]
res <- results(dds, contrast = contrast, parallel = parallel)
resstat <- res$stat
# shrink fold changes for lowly expressed genes
if (lfc_shrink) {
  dds@colData[, cond_id] <- relevel(dds@colData[, cond_id], ref = contrast_groups[2])
  dds <- DESeq(dds, parallel = parallel)
  res_new <- lfcShrink(dds, coef = stringr::str_replace_all(coef_string, "-", "."), type = "apeglm")
  print(head(res_new))
  # sort by p-value
  res_new$stat <- resstat
  res_new <- res_new[, c("baseMean", "log2FoldChange", "lfcSE", "stat", "pvalue", "padj")]
  res_new <- res_new[order(res_new$padj), ]
}
# store results
pdf(snakemake@output[["ma_plot"]])
plotMA(res, ylim = c(-2, 2))
if (lfc_shrink) {
  plotMA(res_new, ylim = c(-2, 2))
}
dev.off()

if (lfc_shrink) {
  res <- res_new
}
#


write.table(as.data.frame(res), sep = "\t", quote = FALSE, file = snakemake@output[["table"]])

R DESeq2 BiocParallel From line 1 of scripts/deseq2.R

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, dev = "png",
  fig.width = 12, fig.height = 12
)
if (!require(dorothea)) {
  BiocManager::install("dorothea")
}
# library(dorothea)
# library(bcellViper)
library(dplyr)
library(viper)
library(decoupleR)
library(RColorBrewer)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
library(ggplot2)
library(readr)
library(ComplexHeatmap)
library(DESeq2)
library(purrr)
library(magrittr)
if (exists("snakemake")) {
  ## Input Files
  dds_path <- snakemake@input[["dds_path"]]
  fpkm_path <- snakemake@input[["fpkm_path"]]
  ### Output
  dorothea_fp <- snakemake@input[["dorothea_fp"]]
  # Params
  cond_id <- names(snakemake@config[["diffexp"]][["contrasts"]])
  samp_map <- snakemake@params[["samp_map"]]
  plot_path <- snakemake@params[["plot_path"]]
  comp_groups <- snakemake@config[["signif_comp"]]
  color_scheme <- purrr::map(snakemake@config[["group_colors"]], unlist)
  organism <- snakemake@config[["organism"]]
} else {
  the_yaml <- yaml::read_yaml("../../config/STAD.yaml")

  BASE_ANALYSIS_DIR <- c(the_yaml$dirs$BASE_ANALYSIS_DIR)
  #  comp_id <- "YECplusA-vs-YEC"

  #  contrast_groups <- c("Young-EC_apelin", "Young-EC")
  dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds")
  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  samp_map <- the_yaml$samples
  plot_path <- "./"
  cond_id <- names(the_yaml$diffexp$contrasts)
  comp_groups <- the_yaml$comp_groups
  color_scheme <- purrr::map(the_yaml$group_colors, unlist)
  organism <- "Homo sapiens"
}
if (!dir.exists(plot_path)) {
  dir.create(plot_path)
}

R Markdown Snakemake ggplot2 dplyr readr magrittr RColorBrewer purrr ComplexHeatmap viper decoupleR From line 9 of scripts/dorothea.Rmd

dds_file <- readRDS(dds_path)

Normalized_counts <- getVarianceStabilizedData(dds_file)
# Normalized_counts <- assay(vst(dds_file, blind = F))
fpkm <- read_tsv(fpkm_path)
filer <- fpkm %>%
  dplyr::filter(gene %in% rownames(Normalized_counts)) %>%
  dplyr::filter(!duplicated(gname))

# joined_df <- join_tables(diffexp_tb,filer) %>% dplyr::filter(!duplicated(gname))
Normalized_counts <- Normalized_counts[filer$gene, ]

rownames(Normalized_counts) <- filer$gname

# %>% dplyr::select(gname, stat) %>% dplyr::filter(!is.na(stat)) %>%
# column_to_rownames(var = "gname") %>% as.matrix() -> diffexp_matrix
expression_matrix <- Normalized_counts

# regulons <- dorothea_mm %>% filter(confidence %in% c("A", "B", "C"))
sample_overview <- colData(dds_file) %>% as_tibble(rownames = "sample")

R Markdown From line 74 of scripts/dorothea.Rmd

organism_name <- RNAscripts::get_organism_omnipath_name(organism)
net <- get_dorothea(organism = organism_name, levels = c("A", "B", "C"))

viper_net <- net
colnames(viper_net) <- colnames(dorothea_mm)
tf_activities <- decoupleR::run_viper(expression_matrix, net,
  minsize = 4
)
tf_activities %>%
  dplyr::filter(statistic == "viper") %>%
  tidyr::pivot_wider(
    id_cols = "condition", names_from = "source",
    values_from = "score"
  ) %>%
  tibble::column_to_rownames("condition") %>%
  as.matrix() %>%
  t() -> tf_activities


plot_doro_hm <- function(dds, cond, color_list, p_data) {
  anno_vec <- dds_file@colData %>%
    as.data.frame() %>%
    pull(!!cond)


  color_vec <- color_list[[cond]]
  if (!is.null(color_vec)) {
    ha <- HeatmapAnnotation(
      group = anno_vec,
      col = list(
        group =
          as_vector(color_vec)
      )
    )
  } else {
    ha <- HeatmapAnnotation(group = anno_vec)
  }

  ch <- Heatmap(p_data,
    top_annotation = ha,
    clustering_distance_columns = "euclidean",
    clustering_method_columns = "average",
    show_row_names = FALSE,
    show_column_names = F,
    row_names_gp = gpar(fontsize = 8),
    column_names_rot = 70
  )
  save_cheatmap_svg(ch, file.path(plot_path, glue::glue("progeny_heatmap_{cond}.svg")))

  ch
}

if (!is.null(color_scheme)) {
  purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = tf_activities, color_list = color_scheme)
} else {
  purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = tf_activities, color_list = color_scheme)
}

R Markdown From line 102 of scripts/dorothea.Rmd

plot_doro_plots <- function(tf_activities, dds_file, cond) {
  mean_tf_activities <- mean_tibble_from_mat(
    mat = tf_activities,
    contrast_groups = colData(dds_file)[, cond],
    s_map = sample_overview, cond_id = cond
  )
  mean_tf_activities["TF"] <- rownames(tf_activities)
  plot_list <- list()
  for (c_group in unique(sample_overview %>% pull(!!cond))) {
    plot_table <- mean_tf_activities %>%
      dplyr::select(TF, c_group) %>%
      dplyr::arrange(!!sym(c_group))
    histo <- ggplot(plot_table, aes(!!sym(c_group))) +
      geom_density() +
      ggtitle(c_group) +
      theme_bw() +
      xlab("NES")

    plot_table <-
      top_n(
        x = plot_table,
        n = 25,
        wt = abs(!!sym(c_group))
      )
    plot_table <-
      plot_table %>% mutate(color = ifelse(!!sym(c_group) > 0, "green", "red"))

    enrich_plot <-
      ggplot(plot_table, aes(x = TF, y = !!sym(c_group))) +
      scale_x_discrete(limits = plot_table$TF) +
      geom_point(size = 3, aes(col = color)) +
      geom_segment(aes(
        x = TF,
        xend = TF,
        y = 0,
        yend = !!sym(c_group)
      )) +
      coord_flip() +
      theme_bw() +
      ylab("Normalized Enrichment Score (NES)") +
      xlab("TF") +
      ggtitle(c_group)

    NES_plot <-
      ggplot(plot_table, aes(x = reorder(TF, !!sym(c_group)), y = !!sym(c_group))) +
      geom_bar(aes(fill = !!sym(c_group)), stat = "identity") +
      scale_fill_gradient2(
        low = "darkblue",
        high = "indianred",
        mid = "whitesmoke",
        midpoint = 0
      ) +
      theme_minimal() +
      theme(
        axis.title = element_text(face = "bold", size = 12),
        axis.text.x =
          element_text(
            angle = 45,
            hjust = 1,
            size = 10,
            face = "bold"
          ),
        axis.text.y = element_text(size = 10, face = "bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "none"
      ) +
      ylab("NES") +
      xlab("Pathways") +
      ggtitle(c_group)
    plot(histo)
    plot(enrich_plot)
    plot(NES_plot)
  }
}

purrr::map(cond_id, plot_doro_plots,
  tf_activities = tf_activities,
  dds_file = dds_file
)

R Markdown ggplot2 From line 165 of scripts/dorothea.Rmd

sample_acts <- run_wmean(
  mat = expression_matrix, network = net, .source = "source", .target = "target",
  .mor = "mor", times = 100, minsize = 5
)

sample_acts_mat <- sample_acts %>%
  dplyr::filter(statistic == "norm_wmean") %>%
  tidyr::pivot_wider(
    id_cols = "condition", names_from = "source",
    values_from = "score"
  ) %>%
  tibble::column_to_rownames("condition") %>%
  as.matrix() %>%
  t()


if (!is.null(color_scheme)) {
  purrr::map(cond_id, plot_doro_hm,
    dds = dds_file,
    p_data = t(scale(t(sample_acts_mat), scale = F)), color_list = color_scheme
  )
} else {
  purrr::map(cond_id, plot_doro_hm, dds = dds_file, p_data = t(scale(t(sample_acts_mat))), color_list = color_scheme)
}

R Markdown From line 251 of scripts/dorothea.Rmd

purrr::map(cond_id, plot_doro_plots,
  tf_activities = t(scale(t(sample_acts_mat))),
  dds_file = dds_file
)

R Markdown From line 279 of scripts/dorothea.Rmd

library(openxlsx)
library(magrittr)
library(dplyr)
library(tidyr)
library(glue)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
if (exists("snakemake")) {
  diffexp_tb_path <- snakemake@input[["table"]]
  cond_id <- snakemake@wildcards[["condition"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  samp_map <- snakemake@params[["samp_map"]]
  contrast_groups <- snakemake@config[["diffexp"]][["contrasts"]][[cond_id]]
  contrast_names <- snakemake@params[["contrast_groups"]]
  names(contrast_groups) <- names(contrast_names)
  output_path <- snakemake@output[["outpath"]]
  print(c(diffexp_tb_path, fpkm_path, samp_map, contrast_groups, contrast_names))
  pval_threshold <- snakemake@config[["diffexp"]][["pval_threshold"]]
  lfc_threshold <- snakemake@config[["diffexp"]][["LFC_threshold"]]
} else {
  BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/apelin_exp_test"

  test_config <- yaml::read_yaml("/desktop-home/heyer/projects/Vascular_Aging/RNAseq/multicondition-deseq2-enrichment/configs/VascAge_Apelin_config.yaml")
  cond_id <- "condition"
  contrast_groups <- test_config$diffexp$contrasts$condition
  contrast_names <- test_config$diffexp$contrasts$condition
  diffexp_tb_path <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/condition/{names(contrast_groups)}.diffexp.tsv")))

  names(diffexp_tb_path) <- names(contrast_groups)
  lfc_threshold <- 1
  pval_threshold <- 0.05

  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  samp_map <- "/desktop-home/heyer/projects/Vascular_Aging/RNAseq/rna-seq-star-deseq2/data/apelin_2020/VascAge_samples_apelin.tsv"
  output_path <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/apelin_exp_test/results/diffexp/condition/excel_tables/diff_exp_man.xlsx"
}


print("read samples")
sample_overview <- readr::read_tsv(samp_map)


col_name <- c("gene", "gname")

print("read fpkm table")
fpkm <- readr::read_tsv(fpkm_path)
sample_overview <- sample_overview %>%
  dplyr::filter(sample %in% colnames(fpkm))
build_output_table <- function(diff_exp_table, c_group, fpkm, col_name) {
  print(c_group)
  print(col_name)
  relevant_samples <- sample_overview %>%
    dplyr::filter(!!sym(cond_id) %in% c_group) %>%
    pull(sample)
  print(relevant_samples)
  deseq2_table <- readr::read_tsv(diff_exp_table, col_names = c(
    "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat",
    "pvalue", "padj"
  ), skip = 1)
  print("read deseq")

  # print(colnames(fpkm))
  fpkm_data <- fpkm %>%
    dplyr::filter(gene %in% deseq2_table$gene_id) %>%
    dplyr::select(col_name, relevant_samples)
  colnames(fpkm_data)[-c(1:2)] <- sample_overview %>%
    dplyr::filter(sample %in% colnames(fpkm_data)[-c(1:2)]) %>%
    pull(sample)

  print(names(fpkm_data))
  print(names(deseq2_table))
  joined_df <- join_tables(deseq2_table, fpkm_data)


  joined_df <- joined_df %>%
    dplyr::mutate(overexpressed_in = ifelse(logFoldChange > 0, c_group[1], c_group[2]))
  output_tb <- joined_df %>%
    dplyr::filter(padj < pval_threshold & abs(logFoldChange) > lfc_threshold)
  # Shorten names to less than 31 chars or excel gets angry

  output_tb
}

output_list <- purrr::map2(diffexp_tb_path, contrast_groups, build_output_table, fpkm = fpkm, col_name = col_name)
print(contrast_names)
names(output_list) <- purrr::map_chr(names(contrast_names), function(comp_name) {
  comp_name <- ifelse(nchar(comp_name) > 31, stringr::str_trunc(comp_name, 30, "right"), comp_name)
  comp_name
})
print(names(output_list))
# names(output_list) <- names(contrast_names)
openxlsx::write.xlsx(x = output_list, file = output_path, )

R Snakemake dplyr tidyr magrittr GLUE openxlsx From line 1 of scripts/export_diffexp_xlsx.R

import gffutils

db = gffutils.create_db(
    snakemake.input[0],
    dbfn=snakemake.output.db,
    force=True,
    keep_order=True,
    merge_strategy="merge",
    sort_attribute_values=True,
    disable_infer_genes=True,
    disable_infer_transcripts=True,
)

with open(snakemake.output.bed, "w") as outfileobj:
    for tx in db.features_of_type("transcript", order_by="start"):
        bed = [s.strip() for s in db.bed12(tx).split("\t")]
        bed[3] = tx.id
        outfileobj.write("{}\n".format("\t".join(bed)))

Python From line 1 of scripts/gtf2bed.py

log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("DESeq2")

# load deseq2 data
dds <- readRDS(snakemake@input[[1]])

# obtain normalized counts counts <- assay(dds)
svg(snakemake@output[[1]])
plotPCA(dds, intgroup = snakemake@params[["pca_labels"]])
dev.off()

R DESeq2 PCAtools From line 1 of scripts/plot-pca.R

library(tidyverse)
library(DESeq2)
deseq_obj <- readRDS(snakemake@input[["dds_obj"]])

if (ncol(deseq_obj) < 50) {
  rld <- DESeq2::rlog(deseq_obj, blind = FALSE)
} else {
  rld <- DESeq2::vst(deseq_obj, blind = FALSE)
}

saveRDS(rld, snakemake@output[["rld"]])

norm_counts <- assay(rld, withDimnames = T) %>%
  as_tibble(rownames = "gene")
og_gene_names <- norm_counts$gene
norm_counts$gene <- stringr::str_extract(norm_counts$gene, pattern = "^ENS[A-Z0-9]*")
# this variable holds a mirror name until useEnsembl succeeds ('www' is last, because of very frequent 'Internal Server
# Error's)

species <- RNAscripts::get_organism_ensembl_name(snakemake@config[["organism"]])
stable_get_bm <- function(species) {
  mart <- "useast"
  rounds <- 0
  while (class(mart)[[1]] != "Mart") {
    mart <- tryCatch(
      {
        # done here, because error function does not modify outer scope variables, I tried
        if (mart == "www") {
          rounds <- rounds + 1
        }
        # equivalent to useMart, but you can choose the mirror instead of specifying a host
        biomaRt::useEnsembl(biomart = "ENSEMBL_MART_ENSEMBL", dataset = glue::glue("{species}_gene_ensembl"), mirror = mart)
      },
      error = function(e) {
        # change or make configurable if you want more or less rounds of tries of all the mirrors
        if (rounds >= 3) {
          stop()
        }
        # hop to next mirror
        mart <- switch(mart,
          useast = "uswest",
          uswest = "asia",
          asia = "www",
          www = {
            # wait before starting another round through the mirrors, hoping that intermittent problems disappear
            Sys.sleep(30)
            "useast"
          }
        )
      }
    )
  }
  mart
}

mart <- stable_get_bm(species)
# df <- read.table(snakemake@input']], sep='\t', header=1)
gene_name_type <- snakemake@config[["gene_name_type"]]
if (gene_name_type == "ENSEMBL") {
  g2g <- biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"), filters = "ensembl_gene_id", values = stringr::str_extract(norm_counts$gene,
    pattern = "^ENS[A-Z0-9]*"
  ), mart = mart, )
  colnames(g2g) <- c("gene", "gname")
  norm_counts$short_gene_names <- stringr::str_extract(norm_counts$gene, pattern = "^ENS[A-Z0-9]*")
  annotated <- dplyr::left_join(norm_counts, g2g, by = c(short_gene_names = "gene")) %>%
    dplyr::select(-short_gene_names)
  annotated$gname <- ifelse(annotated$gname == "" | is.na(annotated$gname), annotated$gene, annotated$gname)
} else if (gene_name_type == "ENTREZ_ID") {
  stop("to be implemented")
} else if (gene_name_type == "HGNC") {
  annotated <- norm_counts
  annotated$gname <- annotated$gene
} else {
  stop("non valid gene name type")
}
# annotated$gene <- ifelse(annotated$external_gene_name == '', annotated$gene, annotated$external_gene_name)
annotated$gene <- og_gene_names

readr::write_tsv(annotated, snakemake@output[["fpkm"]])

R tidyverse From line 1 of scripts/rlog_transform.R

contrast_groups <- ifelse(exists("snakemake"),
  snakemake@wildcards[["contrast"]],
  ""
)

R Markdown Snakemake From line 9 of RMD_scripts/carnival_downstream.Rmd

library(CARNIVAL)
library(readr)
library(piano)
library(dplyr)
library(ggplot2)
library(tibble)
library(tidyr)
library(dplyr)
library(scales)
library(plyr)
library(GSEABase)
library(network)
library(reshape2)
library(cowplot)
library(pheatmap)
library(ggraph)
library(tidygraph)
library(snowfall)

R Markdown ggplot2 dplyr tidyr readr cowplot reshape2 tibble scales plyr ggraph GSEABase tidygraph CARNIVAL network PIANO snowfall From line 20 of RMD_scripts/carnival_downstream.Rmd

if (exists("snakemake")) {
  carnival_path <- snakemake@input[["carnival_obj"]]
  tt_basepath <- snakemake@params[["tutorial_source_path"]]
  comp_group <- snakemake@wildcards[["contrast"]]
  out_file <- snakemake@output[[1]]
  organism <- snakemake@config[["organism"]]
} else {
  yaml <- yaml::read_yaml("../../../configs/VascAge_cre_config.yaml")
  comp_group <- names(yaml$diffexp$contrasts$condition)[1]
  base_path <- yaml$dirs$BASE_ANALYSIS_DIR
  carnival_path <- file.path(
    base_path, "results/inversecarnival",
    glue::glue("{comp_group}_carnival_res.RDS.gz")
  )

  tt_basepath <- "../transcriptutorial"
  out_file <- ""
}
source(file.path(tt_basepath, "support_enrichment.r"))
source(file.path(tt_basepath, "support_networks.r"))

process_carnival <- function(carnival_res) {
  carnival_res$weightedSIF <- carnival_res$weightedSIF %>% dplyr::filter(Weight != 0)
  carnival_res
}
msig_h <- msigdbr::msigdbr(organism, "C2", )
pathways <- loadGSC(
  file = dplyr::select(
    msig_h,
    c("gene_symbol", "gs_name")
  ),
  addInfo = dplyr::select(
    msig_h,
    c("gs_name", "gs_description")
  )
)

R Markdown Snakemake From line 48 of RMD_scripts/carnival_downstream.Rmd

carnival_result <- readRDS(carnival_path)
if (length(carnival_result) == 0) {
  write("CARNIVAL FAILED", file = out_file)
  quit(save = "no", status = 0)
}

carnival_result <- process_carnival(carnival_result)
nodes_carnival <- extractCARNIVALnodes(carnival_result)

sig_pathways <- runGSAhyper(
  genes = nodes_carnival$sucesses,
  universe = nodes_carnival$bg, gsc = pathways
)
sig_pathways_df <- as.data.frame(sig_pathways$resTab) %>%
  tibble::rownames_to_column(var = "pathway")

pathways_select <- sig_pathways_df %>%
  dplyr::select(pathway, `p-value`, `Adjusted p-value`) %>%
  dplyr::filter(`Adjusted p-value` <= 0.05) %>% # Signficance value fixed for now
  dplyr::rename(pvalue = `p-value`, AdjPvalu = `Adjusted p-value`) %>%
  dplyr::mutate(pathway = as.factor(pathway))


pathways_select <- data.frame(t(apply(pathways_select, 1, function(r) {
  aux <- unlist(strsplit(sub("_", ";", r["pathway"]), ";"))
  r["pathway"] <- gsub("_", " ", aux[2])
  return(c(r, "source" = aux[1]))
})))

if (ncol(pathways_select) == 4) {
  colnames(pathways_select) <- c("pathway", "pvalue", "AdjPvalu", "source")
  pathways_select$AdjPvalu <- as.numeric(pathways_select$AdjPvalu)

  ggdata <- pathways_select %>%
    dplyr::slice_min(AdjPvalu, n = 25) %>%
    dplyr::filter(AdjPvalu <= 0.05) %>%
    dplyr::group_by(source) %>%
    dplyr::arrange(AdjPvalu) %>%
    dplyr::slice(1:5)

  # Visualize top results
  ggplot(ggdata, aes(
    y = reorder(pathway, -log10(AdjPvalu)),
    x = -log10(AdjPvalu)
  ), color = source) +
    facet_grid(source ~ ., scales = "free", space = "free") +
    geom_bar(stat = "identity") +
    annotation_logticks(sides = "bt") +
    theme_bw() +
    theme(
      axis.title = element_text(face = "bold", size = 12),
      axis.text.y = element_text(size = 6)
    ) +
    ylab("")
}

R Markdown ggplot2 From line 98 of RMD_scripts/carnival_downstream.Rmd

plot_carn_network <- function(carn_res) {
  ed <- carn_res$weightedSIF
  colnames(ed) <- c("from", "sign", "to", "weight")
  nod <- data.frame(union(ed$from, ed$to))

  colnames(nod) <- "nodes"
  nod$label <- nod$nodes
  joined_nod <- left_join(nod, carn_res$nodesAttributes,
    by = c("label" = "Node")
  )
  yote <- tidygraph::tbl_graph(
    nodes = joined_nod,
    edges = ed
  ) %>%
    ggraph(layout = "auto") +
    geom_node_point(aes(color = AvgAct), size = 8) + scale_color_gradient2() +
    geom_edge_link(arrow = arrow(), aes(
      edge_colour = as.factor(sign),
      edge_alpha = weight
    )) +
    theme_graph() + geom_node_text(aes(label = label), vjust = 0.4, repel = T)
  yote
}

plot_carn_network(carnival_result) + ggtitle(comp_group)

R Markdown From line 161 of RMD_scripts/carnival_downstream.Rmd

options(error = traceback, echo = FALSE)
library(readr)
library(piano)
library(dplyr)
library(ggplot2)
library(tibble)
library(tidyr)
library(dplyr)
library(scales)
library(plyr)
library(GSEABase)
library(network)
library(reshape2)
library(cowplot)
library(pheatmap)
library(ggraph)
library(tidygraph)
library(ComplexHeatmap)
library(RNAscripts)
library(UpSetR)
library(knitr)

R Markdown ggplot2 dplyr tidyr readr cowplot reshape2 tibble knitr scales plyr ComplexHeatmap UpSetR ggraph GSEABase tidygraph network PIANO From line 9 of RMD_scripts/carnival_join.Rmd

if (exists("snakemake")) {
  carnival_paths <- snakemake@input[["carnival_objs"]] %>% as.character()
  cond_id <- snakemake@wildcards[["condition"]]
  names(carnival_paths) <- names(snakemake@config[["diffexp"]][["contrasts"]][[cond_id]])
  contrast_names <- names(snakemake@config[["diffexp"]][["contrasts"]][[cond_id]])
  write(carnival_paths, file = stderr())
  write(names(carnival_paths), file = stderr())
  tt_basepath <- snakemake@params[["tutorial_source_path"]]
  # Output Files
  outpath <- snakemake@output[[1]]
  organism <- snakemake@config[["organism"]]
} else {
  # BASE_ANALYSIS_DIR = '/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/Vasc_age2020/'
  cond_id <- "condition"
  the_yaml <- file.path("../../../configs/VascAge_Apelin_config.yaml")

  yaml_me <- yaml::read_yaml(the_yaml)
  BASE_ANALYSIS_DIR <- yaml_me$dirs$BASE_ANALYSIS_DIR
  carnival_paths <- file.path(
    BASE_ANALYSIS_DIR, "results/carnival/condition",
    paste0(names(yaml_me$diffexp$contrasts$condition), "_carnival_res.RDS.gz")
  )
  names(carnival_paths) <- names(yaml_me$diffexp$contrasts$condition)
  contrast_names <- names(yaml_me$diffexp$contrasts$condition)
  base_path <- ""
  tt_basepath <- "../transcriptutorial"
  organism <- "Mus musculus"
}
source(file.path(tt_basepath, "support_enrichment.r"))
source(file.path(tt_basepath, "support_networks.r"))

## Get Omnipath
organism_number <- RNAscripts::get_organism_omnipath_id(organism)
omniR <- OmnipathR::import_omnipath_interactions(organism = organism_number)

# signed and directed
omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 &
  (consensus_stimulation == 1 |
    consensus_inhibition == 1
  ))

# changing 0/1 criteria in consensus_stimulation/inhibition to -1/1
omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1

# check consistency on consensus sign and select only those in a SIF format
sif <- omnipath_sd[, c("source_genesymbol", "consensus_stimulation",
                       "consensus_inhibition", "target_genesymbol")] %>%
  dplyr::filter(consensus_stimulation == consensus_inhibition) %>%
  unique.data.frame()

sif$consensus_stimulation <- NULL
colnames(sif) <- c("source", "interaction", "target")

# remove complexes
sif$source <- gsub(":", "_", sif$source)
sif$target <- gsub(":", "_", sif$target)

# dorothea for CARNIVAL
carnival_sample_resolution <- purrr::map(carnival_paths, read_rds)
for (i in seq_along(length(carnival_sample_resolution))) {
  if (length(carnival_sample_resolution[[i]]) == 0) {
    carnival_sample_resolution[[i]] <- NULL
  }
}

if (length(carnival_sample_resolution) == 0) {
  write("Carnvival unsuccessful", file = stderr())
  readr::write_tsv(data.frame(), file = outpath)
  quit(save = "no", status = 0, runLast = FALSE)
}


msig_h <- msigdbr::msigdbr("Mus musculus", "C2", )


pathways <- loadGSC(
  file = dplyr::select(msig_h, c("gene_symbol", "gs_name")),
  addInfo = dplyr::select(msig_h, c("gs_name", "gs_description"))
)
# nodes_carnival <- extractCARNIVALnodes(carnival_sample_resolution[[1]])

R Markdown Snakemake Consensus From line 39 of RMD_scripts/carnival_join.Rmd

carnival_sample_resolution <- purrr::map(
  carnival_sample_resolution,
  process_carnival
)
carnival_sample_resolution <- carnival_sample_resolution[purrr::map(carnival_sample_resolution, length) > 0]

if (length(carnival_sample_resolution) == 0) {
  knitr::knit_exit(append = "All sample carnival runs failed. Aborting...")
}

R Markdown From line 134 of RMD_scripts/carnival_join.Rmd

# get only summary files from CARNIVAL results
sifts <- lapply(carnival_sample_resolution, function(x) {
  x$weightedSIF
})
nodos <- lapply(carnival_sample_resolution, function(x) {
  x$nodesAttributes
})
write(names(sifts), file = stderr())
# Calculate the number of edges and nodes in the networks and its density
node_edge <- do.call(rbind, lapply(sifts, count_edges_nodes_degree))

# Calculate degree distribution for a sample
count_degree <- sifts[[1]] %>% degree_count()

# degree distribution
p <- data.frame(table(count_degree$total_count) / nrow(count_degree))
colnames(p) <- c("Var1", "total_degree")
p <- merge.data.frame(p, data.frame(table(count_degree$in_count) / nrow(count_degree)), all = T)
colnames(p) <- c("Var1", "total_degree", "in_degree")
p <- merge.data.frame(p, data.frame(table(count_degree$out_count) / nrow(count_degree)), all = T)
colnames(p) <- c("k", "total_degree", "in_degree", "out_degree")
p <- melt(p, value.name = "p", id.vars = "k")
p$k <- relevel(p$k, "0")

# visualise
ggdat <- as.data.frame(node_edge) %>%
  tibble::rownames_to_column(var = "sample") %>%
  dplyr::mutate(condition = gsub(".Rep[0-9]{1}", "", sample))

# Plotting

# relation between number of edges and nodes
ggplot(ggdat, aes(x = nodes, y = edges, color = as.factor(condition))) +
  geom_point() +
  geom_text(
    label = ggdat$sample,
    check_overlap = TRUE,
    vjust = 0,
    nudge_y = 0.5,
    show.legend = F
  ) +
  theme_bw(base_size = 15) +
  guides(color = guide_legend(title = "Conditions")) +
  ggtitle("Node-edge composition")

R Markdown ggplot2 From line 146 of RMD_scripts/carnival_join.Rmd

# network degree
ggplot(ggdat, aes(x = density, y = sample, fill = as.factor(condition))) +
  geom_col() +
  theme_bw(base_size = 15) +
  guides(fill = guide_legend(title = "Conditions")) +
  ggtitle("Network degree")

R Markdown ggplot2 From line 197 of RMD_scripts/carnival_join.Rmd

# degree distribution
levels(p$k) <- levels(p$k) %>%
  as.numeric() %>%
  sort()
dd <- ggplot(data = p, aes(x = k, y = p, group = variable, color = variable)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  theme(legend.position = "none") +
  guides(color = guide_legend(title = "degree type")) +
  ggtitle("Degree distribution")

ddp <- ggplot(data = p, aes(x = as.numeric(k), y = p, 
                            group = variable, color = variable)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(
    breaks = as.numeric(p$k),
    trans = scales::log_trans()
  ) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  annotation_logticks() +
  theme_bw() +
  guides(color = guide_legend(title = "degree type")) +
  ggtitle("Degree distribution (log scale)") +
  xlab("k (ln)") +
  ylab("p (log10)")

plot_grid(dd, ddp, labels = "auto", rel_widths = c(1, 2))

R Markdown ggplot2 From line 209 of RMD_scripts/carnival_join.Rmd

# create a matrix of all interactions for all samples
write(dim(sif), file = stderr())
interactions <- getTopology(networks = sifts, scafoldNET = sif)
colnames(interactions) <- names(carnival_sample_resolution)
# FIxes bug in topology function (To lazy to actually fix)
ncol_interact <- ncol(interactions)
# interactions <- interactions[,-c(1:ncol_interact/2)]
# get the edges per sample
net_int <- apply(interactions, 2, function(x, r) {
  r[which(!is.na(x))]
}, rownames(interactions))

# calculate Jaccard indexes per pair
combined <- expand.grid(1:length(names(sifts)), 1:length(names(sifts)))
jac_index <- matrix(
  data = NA, nrow = length(names(sifts)), ncol = length(names(sifts)),
  dimnames = list(names(sifts), names(sifts))
)

for (i in 1:nrow(combined)) {
  n <- names(sifts)[combined[i, 1]]
  m <- names(sifts)[combined[i, 2]]
  jac_index[n, m] <- length(intersect(net_int[[n]], net_int[[m]])) / length(union(net_int[[n]], net_int[[m]]))
}

# Visualize the indexes in a heatmap

pheatmap::pheatmap(jac_index,
  fontsize = 14,
  fontsize_row = 10, fontsize_col = 10,
  angle_col = 45, treeheight_col = 0
)
corrplot::corrplot(jac_index, )

R Markdown From line 252 of RMD_scripts/carnival_join.Rmd

# Get common interactions in a group

get_common_interactions <- function(interaction_list, samples, psmpl_per = 95,
                                    carnival_list = NULL) {
  stopifnot(all(samples %in% colnames(interaction_list)))
  interaction_list <- interaction_list[, samples]
  if (!is.null(carnival_list)) {
    nodos <- purrr::map_df(carnival_list[samples], function(carn_res) {
      carn_res$nodesAttributes %>% pull(AvgAct)
    })
    nodos$nodes <- carnival_list[[1]]$nodesAttributes$Node
  }

  shared_interactions_WT <- NULL
  while (is.null(shared_interactions_WT)) {
    if (psmpl_per <= 0) {
      break
    }
    psmpl_per <- psmpl_per - 5
    shared_interactions_WT <- getCoreInteractions(topology = interaction_list,
                                                  psmpl = psmpl_per)
  }

  # Visualise the interactions
  if (!is.null(shared_interactions_WT)) {
    colnames(shared_interactions_WT) <- c("from", "sign", "to")
    labels_edge <- c("-1" = "inhibition", "1" = "activation")
    nodes <- data.frame(union(shared_interactions_WT$from,
                              shared_interactions_WT$to))
    colnames(nodes) <- "nodes"
    nodes$label <- nodes$nodes
    if (!is.null(carnival_list)) {
      nodes <- dplyr::inner_join(nodes, nodos) %>%
        mutate(sd = matrixStats::rowSds(as.matrix(.[samples])))
    } else {
      nodes$sd <- rep(1, nrow(nodes))
    }
    p <- tidygraph::tbl_graph(nodes = nodes, edges = shared_interactions_WT) %>%
      ggraph::ggraph(layout = "auto") +
      geom_node_point(aes(color = sd), size = 8) +
      geom_edge_link(arrow = arrow(), aes(edge_colour = as.factor(sign))) +
      theme_graph() + scale_color_viridis(option = "E") +
      geom_node_text(aes(label = label), nudge_y = 0.04) +
      labs(caption = glue::glue("Overlap in {psmpl_per} percent of groups"))
  } else {
    p <- NULL
  }
  p
}

get_common_interactions(interactions[], names(interactions), 100, carnival_list = carnival_sample_resolution) + ggtitle("Between all groups")

# get_common_interactions(interactions[,c(6,7)], 100) + ggtitle("Apelin Treatment common interactions")

R Markdown From line 296 of RMD_scripts/carnival_join.Rmd

# Get common interactions in a group
create_overlap_matrix <- function(sig_gene_names) {
  gene_name_index <- unlist(sig_gene_names) %>%
    unique() %>%
    sort()

  #' Takes two character vecotrs (info and index) and checks which index
  #' fields are present in info
  #'
  #' @return returns a vecor of length(index) with 1 or 0 depending on
  #'  True or False
  check_against_index <- function(info, index) {
    return(as.numeric(index %in% info))
  }

  gene_matrix <- sapply(sig_gene_names, check_against_index, index = gene_name_index)
  rownames(gene_matrix) <- gene_name_index

  row_sum <- rowSums(gene_matrix)

  gene_matrix <- cbind(gene_matrix, total = rowSums(gene_matrix))

  ordered_matrix <- gene_matrix[order(gene_matrix[, "total"], decreasing = T), ]

  DT::datatable(ordered_matrix)
}
build_comb_gene_set <- function(comb_mat, comb_mat_order) {
  combination_gene_sets <- list()
  for (c_name in ComplexHeatmap::comb_name(comb_mat)) {
    # c_name <- comb_name(comb_mat)[1]
    gnames <- ComplexHeatmap::extract_comb(m = comb_mat, comb_name = c_name)

    groups_index <- stringr::str_split(c_name, "") %>% unlist()
    group_names <- ComplexHeatmap::set_name(comb_mat)[as.logical(as.numeric(groups_index))]
    merged_group_name <- paste(group_names, collapse = " + ")


    combination_gene_sets[[merged_group_name]] <- tibble::tibble(gene_names = gnames,
                                                                 lfc = rep(0, length(gnames)))
  }
  if (length(combination_gene_sets) > 30) {
    combination_gene_sets <- combination_gene_sets[c(match(names(comb_mat_order),
                                                           ComplexHeatmap::comb_name(comb_mat))[1:30])]
  } else {
    combination_gene_sets <- combination_gene_sets
  }
  combination_gene_sets
}

gsa_plot <- function(gene_list, background, pathways) {
  sig_pathways <- runGSAhyper(
    genes = gene_list,
    universe = background, gsc = pathways
  )
  sig_pathways_df <- as.data.frame(sig_pathways$resTab) %>%
    tibble::rownames_to_column(var = "pathway")



  PathwaysSelect <- sig_pathways_df %>%
    dplyr::select(pathway, `p-value`, `Adjusted p-value`) %>%
    dplyr::filter(`Adjusted p-value` <= 0.05) %>%
    dplyr::rename(pvalue = `p-value`, AdjPvalu = `Adjusted p-value`) %>%
    dplyr::mutate(pathway = as.factor(pathway))

  PathwaysSelect <- data.frame(t(apply(PathwaysSelect, 1, function(r) {
    aux <- unlist(strsplit(sub("_", ";", r["pathway"]), ";"))
    r["pathway"] <- gsub("_", " ", aux[2])
    return(c(r, "source" = aux[1]))
  })))

  if (ncol(PathwaysSelect) == 4) {
    colnames(PathwaysSelect) <- c("pathway", "pvalue", "AdjPvalu", "source")
    PathwaysSelect$AdjPvalu <- as.numeric(PathwaysSelect$AdjPvalu)

    ggdata <- PathwaysSelect %>%
      dplyr::slice_min(AdjPvalu, n = 25) %>%
      dplyr::filter(AdjPvalu <= 0.05) %>%
      dplyr::group_by(source) %>%
      dplyr::arrange(AdjPvalu) %>%
      dplyr::slice(1:5)


    # Visualize top results
    ggplot(ggdata, aes(y = reorder(pathway, -log10(AdjPvalu)),
                       x = -log10(AdjPvalu)), color = source) +
      facet_grid(source ~ ., scales = "free", space = "free") +
      geom_bar(stat = "identity") +
      annotation_logticks(sides = "bt") +
      theme_bw() +
      theme(
        axis.title = element_text(face = "bold", size = 12),
        axis.text.y = element_text(size = 6)
      ) +
      ylab("")
  }
}
build_gene_sets <- function(node_list) {
  gene_name_set <- ComplexHeatmap::list_to_matrix(node_list)
  comb_mat <- ComplexHeatmap::make_comb_mat(gene_name_set)
  comb_mat_order <- ComplexHeatmap::comb_size(comb_mat) %>% sort(decreasing = T)

  create_overlap_matrix(node_list)
  combination_gene_sets_youndold <- build_comb_gene_set(comb_mat,
                                                        comb_mat_order = comb_mat_order)
  combination_gene_sets_youndold
}
if (all(c(contrast_names) %in% names(interactions))) {
  get_common_interactions(interactions[],
    samples = contrast_names,
    psmpl_per = 95,
    carnival_list = carnival_sample_resolution
  )


  nodes <- purrr::map(contrast_names, function(x) {
    carnival_sample_resolution[[x]]$nodesAttributes %>%
      dplyr::filter(AvgAct != 0)
  })

  node_list <- purrr::map(nodes, pull, Node)
  names(node_list) <- contrast_names
  upset_act <- UpSetR::fromList(node_list)


  UpSetR::upset(upset_act)

  gene_name_set <- ComplexHeatmap::list_to_matrix(node_list)
  comb_mat <- ComplexHeatmap::make_comb_mat(gene_name_set)
  comb_mat_order <- ComplexHeatmap::comb_size(comb_mat) %>% sort(decreasing = T)

  # create_overlap_matrix(node_list)
  combination_gene_sets <- build_comb_gene_set(comb_mat,
                                               comb_mat_order = comb_mat_order)

  create_overlap_matrix(node_list)
}

R Markdown ggplot2 From line 354 of RMD_scripts/carnival_join.Rmd

sample_nodes <- purrr::map(carnival_sample_resolution, function(x) {
  x$nodesAttributes
})

build_matrix_from_nodes <- function(node_list, node_type = "AvgAct") {
  gene_index <- purrr::map(node_list, ~ pull(., Node)) %>%
    unlist() %>%
    unique()
  node_mat <- purrr::map(node_list, ~ dplyr::select(., Node, !!node_type)) %>%
    purrr::reduce(full_join, by = "Node")
  colnames(node_mat) <- c("Node", names(node_list))
  node_mat
}

avg_mat <- build_matrix_from_nodes(sample_nodes) %>% as.data.frame()
rownames(avg_mat) <- avg_mat$Node
avg_mat <- subset(avg_mat, select = -c(Node)) %>% as.matrix()
non_zero_index <- apply(avg_mat, 1, function(x) !all(x == 0))

ComplexHeatmap::Heatmap(avg_mat[non_zero_index, ], column_names_rot = 45,
                        row_names_gp = gpar(fontsize = 6))

R Markdown From line 501 of RMD_scripts/carnival_join.Rmd

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, dev = "png",
  fig.width = 12, fig.height = 12
)
if (!require(dorothea)) {
  BiocManager::install("dorothea")
}
library(dorothea)
library(decoupleR)
library(BiocParallel)
library(dplyr)
library(viper)
library(RColorBrewer)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
library(ggplot2)
library(readr)
library(ComplexHeatmap)
library(DESeq2)
library(purrr)
library(tidyverse)
if (TRUE) {
  write(str(snakemake), file = stderr())
  dds_path <- snakemake@input[["dds_obj"]]
  cond_var <- snakemake@wildcards[["condition"]]
  diffexp_tb_path <- snakemake@input[["table"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  contrast_groups <- snakemake@wildcards[["contrast"]]

  register(MulticoreParam(snakemake@threads))
  contrast_name <- contrast_groups
  plot_path <- snakemake@params[["plot_path"]]
  color_scheme <- snakemake@config[["group_colors"]][[cond_var]]
  organism <- snakemake@config[["organism"]]
} else {
  the_yaml <- yaml::read_yaml("../../../config/STAD.yaml")
  snakedir <- "/Users/heyechri/Documents/software/heyer/multicondition-deseq2-enrichment"
  BASE_ANALYSIS_DIR <- file.path(snakedir, "data/STAD")
  dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds")
  cond_var <- names(the_yaml$diffexp$contrasts)[2]
  diffexp_tb_path <- file.path(
    paste0(BASE_ANALYSIS_DIR),
    glue::glue("results/diffexp/{cond_var}/{names(the_yaml$diffexp$contrasts[[cond_var]])[1]}.diffexp.tsv")
  )
  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  contrast_groups <- the_yaml$diffexp$contrasts[[cond_var]][1]
  plot_path <- "."
  register(SerialParam())
  # s_groups<- c("d0-lung", "d15-lung", "d22-lung", "d36-lung", "18m-lung")
  contrast_name <- glue::glue("{contrast_groups[1]} vs {contrast_groups[2]}")

  comp_groups <- the_yaml$comp_groups
  color_scheme <- the_yaml$group_colors[[cond_var]]
  organism <- "Homo sapiens"
}
dir.create(plot_path, recursive = T)

R Markdown ggplot2 tidyverse dplyr readr RColorBrewer purrr ComplexHeatmap viper decoupleR From line 9 of RMD_scripts/dorothea_diffexp.Rmd

dds_obj <- readRDS(dds_path)
diffexp_tb <- read_tsv(diffexp_tb_path,
  col_names = c(
    "gene_id",
    "baseMean",
    "logFoldChange",
    "lfcSE", "stat",
    "pvalue", "padj"
  ),
  skip = 1
)
Normalized_counts <- getVarianceStabilizedData(dds_obj)
# Normalized_counts<- assay(vst(dds_obj,blind = FALSE))
fpkm <- read_tsv(fpkm_path)
filer <- fpkm %>%
  dplyr::filter(gene %in% rownames(Normalized_counts)) %>%
  dplyr::filter(!duplicated(gname))

joined_df <- join_tables(diffexp_tb, filer) %>% dplyr::filter(!duplicated(gname))
Normalized_counts <- Normalized_counts[filer$gene, ]

rownames(Normalized_counts) <- filer$gname

joined_df %>%
  dplyr::select(gname, stat) %>%
  dplyr::filter(!is.na(stat)) %>%
  tibble::column_to_rownames(var = "gname") %>%
  as.matrix() -> diffexp_matrix
regulons <- dorothea_mm %>% filter(confidence %in% c("A", "B", "C"))

R Markdown From line 76 of RMD_scripts/dorothea_diffexp.Rmd

if (organism == "Mus musculus") {
  org_name <- "mouse"
} else if (organism == "Homo sapiens") {
  org_name <- "human"
} else {
  stop("Organism not supported. Please set Mus musculus or Homo sapiens in the config")
}
net <- get_dorothea(organism = org_name, levels = c("A", "B", "C"))
deg <- joined_df %>%
  dplyr::select(gname, logFoldChange, stat, padj) %>%
  dplyr::filter(!is.na(stat)) %>%
  mutate(padj = tidyr::replace_na(padj, 1)) %>%
  tibble::column_to_rownames(var = "gname") %>%
  as.matrix() -> diffexp_matrix
design <- dds_obj@colData
sample_acts <- run_wmean(
  mat = Normalized_counts, network = net, .source = "source", .target = "target",
  .mor = "mor", times = 1000, minsize = 2
)
n_tfs <- 30

# Transform to wide matrix
sample_acts_mat <- sample_acts %>%
  filter(statistic == "norm_wmean") %>%
  pivot_wider(
    id_cols = "condition", names_from = "source",
    values_from = "score"
  ) %>%
  tibble::column_to_rownames("condition") %>%
  as.matrix()

# Get top tfs with more variable means across clusters
tfs <- sample_acts %>%
  group_by(source) %>%
  dplyr::summarise(std = sd(score)) %>%
  arrange(-abs(std)) %>%
  head(n_tfs, n = 30) %>%
  pull(source)
sample_acts_mat <- sample_acts_mat[, tfs]

# Scale per sample
sample_acts_mat <- scale(sample_acts_mat, scale = F)

# Choose color palette
palette_length <- 100
my_color <- colorRampPalette(c("Darkblue", "white", "red"))(palette_length)

my_breaks <- c(
  seq(-3, 0, length.out = ceiling(palette_length / 2) + 1),
  seq(0.05, 3, length.out = floor(palette_length / 2))
)

if (!is.null(color_scheme)) {
  ha <- rowAnnotation(
    group = as.character(dds_obj@colData[, cond_var]),
    col = list(
      group =
        as_vector(color_scheme)
    )
  )
} else {
  ha <- rowAnnotation(group = as.character(dds_obj@colData[, cond_var]))
}


# Plot
ComplexHeatmap::Heatmap(sample_acts_mat,
  clustering_method_rows = "average",
  clustering_method_columns = "average",
  right_annotation = ha,
  heatmap_legend_param = list(title = "score")
)

R Markdown From line 108 of RMD_scripts/dorothea_diffexp.Rmd

contrast_acts <- run_wmean(
  mat = deg[, "stat", drop = FALSE],
  net = net, .source = "source", .target = "target",
  .mor = "mor", times = 1000, minsize = 3
)
contrast_acts
f_contrast_acts <- contrast_acts %>%
  filter(statistic == "norm_wmean") %>%
  mutate(rnk = NA)

# Filter top TFs in both signs
msk <- f_contrast_acts$score > 0
f_contrast_acts[msk, "rnk"] <- rank(-f_contrast_acts[msk, "score"])
f_contrast_acts[!msk, "rnk"] <- rank(-abs(f_contrast_acts[!msk, "score"]))
tfs <- f_contrast_acts %>%
  arrange(rnk) %>%
  head(n_tfs) %>%
  pull(source)
f_contrast_acts <- f_contrast_acts %>%
  filter(source %in% tfs)

# Plot
ggplot(f_contrast_acts, aes(x = reorder(source, score), y = score)) +
  geom_bar(aes(fill = score), stat = "identity") +
  scale_fill_gradient2(
    low = "darkblue", high = "indianred",
    mid = "whitesmoke", midpoint = 0
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(face = "bold", size = 12),
    axis.text.x =
      element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
    axis.text.y = element_text(size = 10, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  xlab("TFs")

R Markdown ggplot2 From line 183 of RMD_scripts/dorothea_diffexp.Rmd

viper_net <- net
colnames(viper_net) <- colnames(dorothea_mm)
tf_activities_stat <- dorothea::run_viper(diffexp_matrix, viper_net,
  options = list(
    minsize = 5, eset.filter = FALSE,
    cores = 1, verbose = F, nes = TRUE
  )
)

tf_activities_stat_top25 <- tf_activities_stat %>%
  as.data.frame() %>%
  rownames_to_column(var = "GeneID") %>%
  dplyr::rename(NES = "stat") %>%
  dplyr::top_n(25, wt = abs(NES)) %>%
  dplyr::arrange(NES) %>%
  dplyr::mutate(GeneID = factor(GeneID))

NES_plot <- ggplot(tf_activities_stat_top25, aes(x = reorder(GeneID, NES), y = NES)) +
  geom_bar(aes(fill = NES), stat = "identity") +
  scale_fill_gradient2(
    low = "darkblue", high = "indianred",
    mid = "whitesmoke", midpoint = 0
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(face = "bold", size = 12),
    axis.text.x =
      element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
    axis.text.y = element_text(size = 10, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  xlab("Transcription Factors") +
  ggtitle(contrast_name)
ggsave(filename = file.path(plot_path, glue::glue("{contrast_name}_NES_plot.svg")), NES_plot)
NES_plot

R Markdown ggplot2 From line 238 of RMD_scripts/dorothea_diffexp.Rmd

tf_activities_counts <-
  dorothea::run_viper(Normalized_counts, regulons,
    options = list(
      minsize = 5, eset.filter = FALSE,
      cores = 1, verbose = FALSE, method = c("scale")
    )
  )

tf_activities_counts_filter <- tf_activities_counts %>%
  as.data.frame() %>%
  rownames_to_column(var = "GeneID") %>%
  dplyr::filter(GeneID %in% tf_activities_stat_top25$GeneID) %>%
  column_to_rownames(var = "GeneID") %>%
  as.matrix()
tf_activities_vector <- as.vector(tf_activities_counts_filter)

paletteLength <- 100

ha <- HeatmapAnnotation(
  group = dds_obj@colData$condition,
  col = list(
    group =
      as_vector(color_scheme)
  )
)

dorothea_hmap <- ComplexHeatmap::Heatmap(tf_activities_counts_filter,
  name = glue::glue("NES"),
  top_annotation = ha,
  show_column_names = F
)

save_cheatmap_svg(x = dorothea_hmap, filename = file.path(plot_path, glue::glue("{contrast_name}_dorothea.svg")))
dorothea_hmap

R Markdown From line 277 of RMD_scripts/dorothea_diffexp.Rmd

tf_activities_CARNIVALinput <- tf_activities_stat %>%
  as.data.frame() %>%
  tibble::rownames_to_column(var = "TF")
write_csv(tf_activities_CARNIVALinput, "../results/TFActivity_CARNIVALinput.csv")

R Markdown From line 315 of RMD_scripts/dorothea_diffexp.Rmd

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, dev = "png",
  fig.width = 12, fig.height = 12
)
library(purrr)
library(clusterProfiler)
library(magrittr)
library(ggplot2)
library(org.Mm.eg.db)
library(svglite)
if (exists("snakemake")) {
  gsea_list <- snakemake@input[["gsea_result"]]
  plot_path <- snakemake@params[["plot_path"]]
  c_groups <- snakemake@params[["contrast_groups"]]
  write(unlist(c_groups), file = stderr())
  names(gsea_list) <- names(x = c_groups)
  species_name <- snakemake@config[["organism"]]
  gene_name_type <- snakemake@config[["gene_name_type"]]
  gsea_conf <- snakemake@config[["gsea"]]
  save_plots <- TRUE
} else {
  conf <- yaml::read_yaml("../../../configs/VascAge_Apelin_config.yaml")
  BASE_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR)
  cond_id <- names(conf$diffexp$contrasts)[1]
  gsea_list <- as.list(file.path(BASE_DIR, glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.gseares.RDS")))
  c_groups <- names(conf$diffexp$contrasts[[cond_id]])
  names(gsea_list) <- c_groups
  #  gsea_list <- gsea_list[1:3]
  plot_path <- "/home/heyer/tmp/"
  species_name <- "Mus musculus"
  gene_name_type <- "ENSEMBL"
  save_plots <- FALSE
  gsea_conf <- conf$gsea
}
limit <-  10

R Markdown Snakemake ggplot2 magrittr org.Mm.eg.db purrr clusterProfiler svglite From line 11 of RMD_scripts/gsea_report.Rmd

create_all_gsea_plots <- function(msig_list, contrast_n, limit = 30, plot_type, save_plots = F) {
  if (is.list(msig_list)) {
    msig_list <- msig_list[[1]]
  }
  if (nrow(msig_list) < limit) {
    sig_ids <- msig_list@result$ID
  } else {
    sig_ids <- msig_list@result$ID[1:limit]
  }
  all_plots <- purrr::map(sig_ids, function(gset_id) {
    p <- enrichplot::gseaplot2(
      x = msig_list,
      geneSetID = gset_id,
      title = msig_list[gset_id, ]$Description
    )
    aplot::as.patchwork(p)
  })
  names(all_plots) <- sig_ids
  if (save_plots) {
    gsea_plot_path <- glue::glue("{plot_path}/{contrast_n}/{plot_type}", recursive = T)
    dir.create(gsea_plot_path, recursive = T)
    purrr::map2(all_plots, sig_ids, function(p, n) {
      ggsave(plot = p, filename = file.path(gsea_plot_path, paste0(n, "_gseaplot.svgz")), device = svglite)
    })
  }
  all_plots
}
gsea_result <- purrr::map(gsea_list, readRDS)

R Markdown From line 55 of RMD_scripts/gsea_report.Rmd

build_gene_set_detail <- function(gset_res, gset_name, inkey) {
  ensembl_names <- gset_res@result[gset_name, ]$core_enrichment %>%
    stringr::str_split("/") %>%
    unlist()


  all_geneList <- gset_res@geneList %>% tibble::as_tibble()
  all_geneList[inkey] <- as.character(names(gset_res@geneList))
  all_geneList <- all_geneList %>%
    dplyr::filter(!!rlang::sym(inkey) %in% gset_res@geneSets[[gset_name]]) %>%
    mutate(core_enrichment = ifelse(!!rlang::sym(inkey) %in% ensembl_names, "YES", "NO"))
  org_db <- RNAscripts::get_org_db(species_name)
  t_table <- clusterProfiler::bitr(all_geneList %>% dplyr::pull(inkey), inkey, "SYMBOL", OrgDb = org_db)

  yarp <- dplyr::inner_join(all_geneList, t_table) %>%
    dplyr::relocate(SYMBOL) %>%
    dplyr::arrange(dplyr::desc(value))
  yarp
}


create_msig_entries <- function(gsea_res, msig_type, ktype, limit = 20) {
  stopifnot(!is.null(names(gsea_res)))
  for (set_n in names(gsea_res)) {
    cat("#### ", set_n, " {.tabset} \n")


    yote <- gsea_res[[set_n]][[msig_type]]
    if (is.null(yote)) {
      next
    }
    if (is.list(yote)) {
      yote <- yote[[1]]
    }
    org_db <- RNAscripts::get_org_db(species_name)
    if (is(gsea_result$basal_pos_vs_bas_neg$msig_gsea, "gseaResult")) {
      yote <- setReadable(yote, org_db, keyType = ktype)
    }

    msig_res <- gsea_res[[set_n]][[msig_type]]
    if (is.list(msig_res)) {
      msig_res <- msig_res[[1]]
    }
    all_plots <- create_all_gsea_plots(msig_res,
      plot_type = msig_type,
      contrast_n = set_n,
      limit = limit, save_plots = save_plots
    )

    if (!is.null(msig_res)) {
      if (nrow(msig_res@result) < limit) {
        names(all_plots) <- msig_res@result$ID
      } else {
        names(all_plots) <- msig_res@result$ID[1:limit]
      }
    }
    print(knitr::kable(yote@result[, -c(1, length(yote@result))] %>% arrange(desc(NES))))

    cat("\n")
    for (sig_gset in names(all_plots)) {
      cat("##### ", sig_gset, " \n")

      print(all_plots[[sig_gset]])

      cat("\n")
      gene_enrich_table <- build_gene_set_detail(msig_res,
        gset_name = sig_gset,
        inkey = ktype
      )
      print(knitr::kable(gene_enrich_table))
      cat("\n")
    }
    cat("\n")

    cat("\n \n")
  }
  cat("\n")

}

R Markdown From line 105 of RMD_scripts/gsea_report.Rmd

for (gsea_name in names(gsea_conf)) {
  if (gsea_conf[[gsea_name]]$use_gsea) {
      cat("\n")
      cat("### ", gsea_name , " {.tabset} \n")
      cat("\n")

      create_msig_entries(gsea_result, msig_type = gsea_name,
                          ktype = gsea_conf[[gsea_name]][["id_class"]],
                          limit = limit)
      cat("\n")
      cat("\n")
      gc()
  }
}

R Markdown From line 190 of RMD_scripts/gsea_report.Rmd

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE,
  fig.width = 12, fig.height = 12
)
library(progeny)
library(DESeq2)
library(BiocParallel)
library(readr)
library(magrittr)
library(tidyverse)
library(RColorBrewer)
library(ggplot2)
library(decoupleR)
library(ComplexHeatmap)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
if (exists("snakemake")) {
  dds_path <- snakemake@input[["dds_obj"]]
  diffexp_tb_path <- snakemake@input[["table"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  cond_id <- snakemake@wildcards[["condition"]]
  contrast_groups <- snakemake@wildcards[["contrast"]]
  s_groups <- snakemake@params[["s_groups"]]
  register(MulticoreParam(snakemake@threads))
  contrast_name <- contrast_groups
  plot_path <- snakemake@params[["plot_path"]]
  comp_groups <- snakemake@config[["comp_groups"]]
  color_scheme <- snakemake@config[["group_colors"]][[cond_id]]
  organism <- snakemake@config[["organism"]]
} else {
  the_yaml <- yaml::read_yaml("../../../config/STAD.yaml")
  snakedir <- "/Users/heyechri/Documents/software/heyer/multicondition-deseq2-enrichment"
  BASE_ANALYSIS_DIR <- file.path(snakedir, "data/STAD")
  cond_id <- "SARIFA_Subtype"
  dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds")
  comp_id <- names(the_yaml$diffexp$contrasts)[1]
  diffexp_tb_path <- file.path(
    paste0(BASE_ANALYSIS_DIR),
    glue::glue("results/diffexp/{cond_id}/{names(the_yaml$diffexp$contrasts[[cond_id]])[1]}.diffexp.tsv")
  )
  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  contrast_groups <- the_yaml$diffexp$contrasts[[cond_id]][1]
  plot_path <- "."
  register(SerialParam())
  s_groups <- names(the_yaml$group_colors[[cond_id]])
  # s_groups<- c("d0-lung", "d15-lung", "d22-lung", "d36-lung", "18m-lung")
  contrast_name <- glue::glue("{contrast_groups[1]} vs {contrast_groups[2]}")

  comp_groups <- the_yaml$comp_groups
  color_scheme <- the_yaml$group_colors[[cond_id]]
  organism <- "Homo sapiens"
}
dir.create(plot_path, recursive = T)

R Markdown Snakemake ggplot2 tidyverse readr magrittr RColorBrewer ComplexHeatmap decoupleR progeny From line 9 of RMD_scripts/progeny_analysis.Rmd

dds_obj <- readRDS(dds_path)
diffexp_tb <- read_tsv(diffexp_tb_path,
  col_names = c(
    "gene_id",
    "baseMean",
    "logFoldChange",
    "lfcSE", "stat",
    "pvalue", "padj"
  ),
  skip = 1
)
# Issue: we cant use vst for a small test dataset but getVarianceStabilzedData Works
Normalized_counts <- getVarianceStabilizedData(dds_obj)
# Normalized_counts<- assay(vst(dds_obj, blind = FALSE))
fpkm <- read_tsv(fpkm_path)
filer <- fpkm %>%
  dplyr::filter(gene %in% rownames(Normalized_counts)) %>%
  dplyr::filter(!duplicated(gname))

joined_df <- join_tables(diffexp_tb, filer) %>% dplyr::filter(!duplicated(gname))
Normalized_counts <- Normalized_counts[filer$gene, ]

rownames(Normalized_counts) <- filer$gname

joined_df %>%
  dplyr::select(gname, stat) %>%
  dplyr::filter(!is.na(stat)) %>%
  column_to_rownames(var = "gname") %>%
  as.matrix() -> diffexp_matrix
write("finished parsing", file = stderr())

R Markdown From line 72 of RMD_scripts/progeny_analysis.Rmd

org_name <- get_organism_omnipath_name(organism)

prog_net <- decoupleR::get_progeny(organism = org_name, top = 100)


PathwayActivity_counts <- progeny(Normalized_counts, scale = T, organism = org_name, top = 1000, verbose = F, perm = 1)
# Test case has some nans in progeny, these are set to 0
PathwayActivity_counts[is.nan(PathwayActivity_counts)] <- 0

Activity_counts <- as.vector(PathwayActivity_counts)


ann_col <- colData(dds_obj) %>%
  as_tibble() %>%
  pull(!!sym(cond_id)) %>%
  fct_relevel(names(color_scheme))
if (!is.null(color_scheme)) {
  top_anno <- ComplexHeatmap::HeatmapAnnotation(
    sample = ann_col,
    col = list("sample" = as_vector(color_scheme))
  )
} else {
  top_anno <- ComplexHeatmap::HeatmapAnnotation(sample = ann_col)
}
progeny_hmap <- ComplexHeatmap::Heatmap(t(PathwayActivity_counts),
  top_annotation = top_anno,
  clustering_distance_columns = "euclidean",
  clustering_method_columns = "average",
  show_column_names = F, name =
  )
save_cheatmap_svg(x = progeny_hmap, filename = file.path(plot_path, "progeny_hmap.svg"))
progeny_hmap

R Markdown From line 110 of RMD_scripts/progeny_analysis.Rmd

library(broom)
as_tibble(PathwayActivity_counts) %>% t() -> transposed_pact

kruskal_res <- apply(transposed_pact, 1, kruskal.test, g = as.factor(ann_col))

kruskal_test <- purrr::map_df(kruskal_res, tidy, .id = "pathway") %>%
  mutate(padj = p.adjust(p.value, method = "BH")) %>%
  arrange(desc(statistic))

R Markdown broom From line 145 of RMD_scripts/progeny_analysis.Rmd

plot_activity <- as_tibble(PathwayActivity_counts) %>% add_column(condition = fct_relevel(ann_col, s_groups))

for (pathway in kruskal_test$pathway) {
  pval <- kruskal_test %>%
    dplyr::filter(pathway == !!pathway) %>%
    pull(padj)
  p <- ggplot(plot_activity, aes(x = condition, y = !!sym(pathway), fill = condition)) +
    geom_boxplot(outlier.shape = NA, color = "black") +
    geom_jitter(width = 0.1) +
    theme_bw() +
    geom_hline(yintercept = 0) +
    ggtitle(pathway) +
    labs(caption = paste0("kruskal adj.p = ", round(pval, 4))) +
    ylab("progeny score") +
    ggsignif::geom_signif(
      comparisons = comp_groups,
      step_increase = 0.06, map_signif_level = T, show.legend = , color = "black"
    )

  if (!is.null(color_scheme)) {
    p <- p + scale_fill_manual(values = as_vector(color_scheme))
  }

  ggsave(filename = file.path(plot_path, glue::glue("{pathway}_boxplot.svg")))
  plot(p)
}

R Markdown ggplot2 From line 163 of RMD_scripts/progeny_analysis.Rmd

PathwayActivity_zscore <- progeny(diffexp_matrix,
  scale = TRUE, organism = org_name, top = 100, perm = 10000, z_scores = TRUE, verbose = T
) %>%
  t()
colnames(PathwayActivity_zscore) <- "NES"

R Markdown From line 197 of RMD_scripts/progeny_analysis.Rmd

PathwayActivity_zscore_df <- as.data.frame(PathwayActivity_zscore) %>%
  rownames_to_column(var = "Pathway") %>%
  dplyr::arrange(NES) %>%
  dplyr::mutate(Pathway = factor(Pathway))

NES_plot <- ggplot(PathwayActivity_zscore_df, aes(x = reorder(Pathway, NES), y = NES)) +
  geom_bar(aes(fill = NES), stat = "identity") +
  scale_fill_gradient2(
    low = "darkblue", high = "indianred",
    mid = "whitesmoke", midpoint = 0
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(face = "bold", size = 12),
    axis.text.x =
      element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
    axis.text.y = element_text(size = 10, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  xlab("Pathways") +
  ggtitle(contrast_name)

ggsave(filename = file.path(plot_path, "NES_plot.svg"), NES_plot)
NES_plot

R Markdown ggplot2 From line 205 of RMD_scripts/progeny_analysis.Rmd

prog_matrix <- getModel(org_name, top = 100) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("GeneID")

ttop_KOvsWT_df <- diffexp_matrix %>%
  as.data.frame() %>%
  tibble::rownames_to_column("GeneID")


scatter_plots <- progeny::progenyScatter(
  df = ttop_KOvsWT_df,
  weight_matrix = prog_matrix,
  statName = "stat", verbose = T
)

yeet <- purrr::map2(scatter_plots[[1]], names(scatter_plots[[1]]), function(x, n) ggsave(filename = file.path(plot_path, paste0(n, "_scatter.svg")), plot = x))
purrr::map(scatter_plots[[1]], plot)

R Markdown From line 235 of RMD_scripts/progeny_analysis.Rmd

net <- get_progeny(organism = org_name, top = 100)

minsize <- ifelse(nrow(Normalized_counts) > 1000, 5, 2)
times <- ifelse(nrow(Normalized_counts) > 1000, 1000, 100)

sample_acts <- decoupleR::run_wmean(
  mat = Normalized_counts, net = net, .source = "source", .target = "target",
  .mor = "weight", times = times, minsize = minsize
)

R Markdown From line 262 of RMD_scripts/progeny_analysis.Rmd

# Transform to wide matrix
sample_acts_mat <- sample_acts %>%
  filter(statistic == "norm_wmean") %>%
  pivot_wider(
    id_cols = "condition", names_from = "source",
    values_from = "score"
  ) %>%
  column_to_rownames("condition") %>%
  as.matrix()

# Scale per sample
sample_acts_mat <- scale(sample_acts_mat) %>% t()
sample_acts_mat <- sample_acts_mat[, rownames(dds_obj@colData)]

if (!is.null(color_scheme)) {
  ha <- HeatmapAnnotation(
    group = dds_obj@colData[, cond_id],
    col = list(
      group =
        as_vector(color_scheme)
    )
  )
} else {
  ha <- HeatmapAnnotation(group = dds_obj@colData[, cond_id])
}

# Plot

ComplexHeatmap::Heatmap(sample_acts_mat,
  top_annotation = ha, clustering_method_columns = "average",
  clustering_method_rows = "average", show_column_names = F
)

R Markdown From line 274 of RMD_scripts/progeny_analysis.Rmd

as_tibble(sample_acts_mat) -> out_yeet

yeet <- apply(sample_acts_mat, 1, kruskal.test, g = as.factor(ann_col))

kruskal_test <- purrr::map_df(yeet, tidy, .id = "pathway") %>%
  mutate(padj = p.adjust(p.value, method = "BH")) %>%
  arrange(desc(statistic))
plot_activity <- sample_acts_mat %>%
  t() %>%
  as.data.frame()
plot_activity[, cond_id] <- ann_col

for (pathway in kruskal_test$pathway) {
  pval <- kruskal_test %>%
    dplyr::filter(pathway == !!pathway) %>%
    pull(padj)
  p <- ggplot(plot_activity, aes(x = !!sym(cond_id), y = !!sym(pathway), fill = !!sym(cond_id))) +
    geom_boxplot(outlier.shape = NA, color = "black") +
    geom_jitter(width = 0.1) +
    theme_bw() +
    geom_hline(yintercept = 0) +
    ggtitle(pathway) +
    labs(caption = paste0("kruskal adj.p = ", round(pval, 4))) +
    ylab("progeny score") +
    ggsignif::geom_signif(
      comparisons = comp_groups,
      step_increase = 0.06, map_signif_level = T, show.legend = , color = "black"
    )
  if (!is.null(color_scheme)) {
    p <- p + scale_fill_manual(values = as_vector(color_scheme))
  }
  ggsave(filename = file.path(plot_path, glue::glue("{pathway}_decoupler_boxplot.svg")))
  plot(p)
}

R Markdown ggplot2 From line 311 of RMD_scripts/progeny_analysis.Rmd

deg <- joined_df %>%
  dplyr::select(gname, stat) %>%
  dplyr::filter(!is.na(stat)) %>%
  column_to_rownames(var = "gname") %>%
  as.matrix() -> diffexp_matrix
design <- dds_obj@colData
contrast_acts <- run_wmean(
  mat = deg, net = net, .source = "source", .target = "target",
  .mor = "weight", times = times, minsize = minsize
)
# Filter norm_wmean
f_contrast_acts <- contrast_acts %>%
  filter(statistic == "norm_wmean")

# Plot
ggplot(f_contrast_acts, aes(x = reorder(source, score), y = score)) +
  geom_bar(aes(fill = score), stat = "identity") +
  scale_fill_gradient2(
    low = "darkblue", high = "indianred",
    mid = "whitesmoke", midpoint = 0
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(face = "bold", size = 12),
    axis.text.x =
      element_text(angle = 45, hjust = 1, size = 10, face = "bold"),
    axis.text.y = element_text(size = 10, face = "bold"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  ) +
  xlab("Pathways")

R Markdown ggplot2 From line 348 of RMD_scripts/progeny_analysis.Rmd

plot_pathway <- function(net, pathway, deg) {
  df <- net %>%
    filter(source == pathway) %>%
    arrange(target) %>%
    mutate(ID = target, color = "3") %>%
    column_to_rownames("target")
  inter <- sort(intersect(rownames(deg), rownames(df)))
  df <- df[inter, ]
  df["t_value"] <- deg[inter, ]
  df <- df %>%
    mutate(color = if_else(weight > 0 & t_value > 0, "1", color)) %>%
    mutate(color = if_else(weight > 0 & t_value < 0, "2", color)) %>%
    mutate(color = if_else(weight < 0 & t_value > 0, "2", color)) %>%
    mutate(color = if_else(weight < 0 & t_value < 0, "1", color))

  ggplot(df, aes(x = weight, y = t_value, color = color)) +
    geom_point() +
    scale_colour_manual(values = c("red", "royalblue3", "grey")) +
    ggrepel::geom_label_repel(aes(label = ID)) +
    theme_minimal() +
    theme(legend.position = "none") +
    geom_vline(xintercept = 0, linetype = "dotted") +
    geom_hline(yintercept = 0, linetype = "dotted") +
    ggtitle(pathway)
}
purrr::map(unique(net$source), plot_pathway, net = net, deg = deg)

R Markdown ggplot2 From line 382 of RMD_scripts/progeny_analysis.Rmd

library(tibble)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
if (packageVersion("mitch") < "1.1.8") {
  devtools::install_github("markziemann/mitch")
}
# library(org.Mm.eg.db)
library(mitch)
library(tidyverse)
library(ensembldb)
library(AnnotationHub)
## Load the annotation resource.
ah <- AnnotationHub()
Gtf <- ah["AH28753"]
## Create a EnsDb database file from this.
DbFile <- ensDbFromAH(Gtf)
## We can either generate a database package, or directly load the data
edb <- EnsDb(DbFile)
library(msigdbr)
if (exists("snakemake")) {
  # Input Files
  diffexp_tables_paths <- snakemake@input[["tables"]]
  contrast_list <- snakemake@params[["contrasts"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  # Output Files
  mitch_table <- snakemake@output[["mitch_table"]]
  mitch_rds <- snakemake@output[["mitch_rds"]]
  report_path <- snakemake@output[["mitch_report"]]
  # Parameters
  threads <- snakemake@threads
  enrichment_term_type <- "SYMBOL"
} else {
  BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022/"
  contrast_list <- c(
    "basal_cre_pos_vs_basal_cre_neg", "tumor_cre_pos_vs_tumor_cre_neg", "tumor_cre_pos_vs_basal_cre_pos",
    "tumor_cre_neg_vs_basal_cre_neg"
  )
  diffexp_tables_paths <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/{contrast_list}.diffexp.tsv")))


  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  threads <- 1
  report_path <- "big_yeeter.html"
  enrichment_term_type <- "SYMBOL"
}

## Read data
diff_exp_tables <- purrr::map(diffexp_tables_paths, readr::read_tsv, col_names = c(
  "gene_id", "baseMean", "logFoldChange",
  "lfcSE", "stat", "pvalue", "padj"
), skip = 1)
names(diff_exp_tables) <- contrast_list
diff_exp_frames <- purrr::map(diff_exp_tables, function(x) {
  gene_ids <- x$gene_id
  diff_exp_fr <- as.data.frame(x[, -1])
  rownames(diff_exp_fr) <- gene_ids
  diff_exp_fr
})
# names(diff_exp_frames) <- contrast_list Read FPKM file
fpkm <- readr::read_tsv(fpkm_path)

match_table <- fpkm %>%
  dplyr::select(c("gene", "gname"))
match_vec <- setNames(object = match_table$gname, nm = match_table$gene)


### Setupt mitch input data
prep_mitch_input <- function(deseq_list, e_term = "ENSEMBL") {
  # mitch_input_df <- mitch::mitch_import(diff_exp_frames, 'DESeq2', geneTable = as.data.frame(fpkm[,c('gene',
  # 'gname')]))
  mitch_input_df <- mitch::mitch_import(deseq_list, "DESeq2")
  gname_tb <- tibble::tibble(Transcript_id = rownames(mitch_input_df))
  if (e_term == "ENSEMBL") {
    gname_tb["gene_id"] <- stringr::str_extract(gname_tb$Transcript_id, "ENSMUSG[0-9]*")
  } else if (e_term == "SYMBOL") {
    gname_tb["gene_id"] <- match_vec[gname_tb$Transcript_id]
  }
  rownames(mitch_input_df) <- gname_tb$gene_id
  colnames(mitch_input_df) <- stringr::str_remove_all(colnames(mitch_input_df), pattern = "-")
  ifelse(anyDuplicated(gname_tb$gene_id), warning("Duplicated genes in mitch input dataframe."), print("no duplicates"))

  mitch_input_df
}
#' Uniquify a vector
#'
#' @param x Input factor to check
#' @return corrected facotr value
#' @examples
#' NULL
uniquify <- function(x) {
  while (anyDuplicated(x)) {
    x[duplicated(x)] <- paste0(x[duplicated(x)], "_1")
  }
  x
}
mitch_input_df <- prep_mitch_input(diff_exp_frames)
# mm_reactome <- buildReactome(output_type = enrichment_term_type) ensembl_reactome <- buildReactome(output_type =
# 'ENSEMBL')


if (enrichment_term_type == "SYMBOL") {
  # Get GENE ids
  new_ids <- ensembldb::select(edb, keys = rownames(mitch_input_df), keytype = "GENEID", columns = c("SYMBOL", "GENEID"))
  update_vector <- setNames(new_ids$SYMBOL, nm = new_ids$GENEID)
  # Stop if any duplicated genes in reactome gene set
  names(match_vec) <- stringr::str_extract(names(match_vec), "ENSMUSG[0-9]*")

  geneIDs <- match_vec[rownames(mitch_input_df)]
  geneIDs[names(update_vector)] <- update_vector


  col_names <- c("gs_name", "gene_symbol")
  mm_msigdbr <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>%
    dplyr::select(col_names)
  msigdb_gene_list <- split(x = mm_msigdbr$gene_symbol, f = mm_msigdbr$gs_name)
  ensembl_in_msigdb <- new_ids %>%
    dplyr::filter(SYMBOL %in% mm_msigdbr$gene_symbol) %>%
    pull(GENEID)

  ## Cleanup duplicates
  mitch_input_df <- mitch_input_df[(rownames(mitch_input_df) %in% ensembl_in_msigdb) | !(duplicated(geneIDs[rownames(mitch_input_df)]) |
    duplicated(geneIDs[rownames(mitch_input_df)], fromLast = T)), ]

  geneIDs <- match_vec[rownames(mitch_input_df)]
  # geneIDs[names(update_vector)] <- update_vector
  dup_mitch <- mitch_input_df[] %>%
    as_tibble(rownames = "ENSEMBL")
  dup_mitch["SYMBOL"] <- geneIDs[dup_mitch$ENSEMBL]
  # Remove samples which(dup_mitch$ENSEMBL %in% unlist(ensembl_reactome))
  final_name_vec <- setNames(dup_mitch$SYMBOL, nm = dup_mitch$ENSEMBL)
  final_name_vec <- final_name_vec[-which(!(final_name_vec %in% mm_msigdbr$gene_symbol) & duplicated(final_name_vec))]
  # final_name_vec['ENSMUSG00000051396'] <- 'Gm45902'
  if (anyDuplicated(final_name_vec)) {
    stop("Duplicate in gene names")
  }
  # geneIDs <- uniquify(geneIDs)
  mitch_input_df <- mitch_input_df[names(final_name_vec), ]
  rownames(mitch_input_df) <- final_name_vec
}

## Setup Databases to test msig_hallmarck <- msigdbr::msigdbr(species = 'Mus musculus')

# bain <- msig_hallmarck %>% dplyr::select(c('gs_name', 'gene_symbol')) %>% group_split(gs_name)

# more_bain <- purrr::map(bain, function(x){x$gene_symbol})

# mitch::mitch_calc(mitch_input_df, more_bain, cores = 6, priority ='significance')

# mm_reactome <- buildReactome(output_type = enrichment_term_type) print(colnames(mitch_input_df))
# print(anyDuplicated(colnames(mitch_input_df)))
reac_test <- mitch::mitch_calc(mitch_input_df, msigdb_gene_list, cores = threads)

readr::write_tsv(reac_test$enrichment_result, mitch_table)
saveRDS(reac_test, file = mitch_rds)

mitch::mitch_report(reac_test, outfile = report_path)

R Snakemake tidyverse DESeq2 org.Mm.eg.db tibble AnnotationHub msigdbr ensembldb mitch From line 4 of scripts/run_mitch.R

library(progeny)

library(CARNIVAL)
library(OmnipathR)
library(readr)
library(tibble)
library(tidyr)
library(dplyr)
library(visNetwork)
library(biomaRt)
library(ggplot2)
library(pheatmap)
library(BiocParallel)
library(DESeq2)
library(decoupleR)
library(RNAscripts)
# library(biomaRt) Basic function to convert human to mouse gene names human = useMart(biomart= 'ensembl', dataset =
# 'hsapiens_gene_ensembl') mouse = useMart(biomart = 'ensembl', dataset = 'mmusculus_gene_ensembl')

mouse_human_homologs <- readr::read_tsv("http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt", col_names = c(
  "hgene",
  "hID", "mgene", "mID", "lcol"
))


if (exists("snakemake")) {
  dds_path <- snakemake@input[["dds_obj"]]
  diffexp_tb_path <- snakemake@input[["table"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  carnival_output <- snakemake@output[["carnival_out"]]
  contrast_groups <- snakemake@wildcards[["contrast"]]
  s_groups <- snakemake@params[["s_groups"]]
  register(MulticoreParam(snakemake@threads))
  contrast_name <- contrast_groups
  plot_path <- snakemake@params[["plot_path"]]
  comp_groups <- snakemake@config[["signif_comp"]]
  color_scheme <- snakemake@config[["group_colors"]]
  cplex_path <- snakemake@config[["cplex_solver"]]
  stopifnot(`Cplexpath doesnt exist please give path` = file.exists(cplex_path))
  temp_path <- snakemake@params[["temp_path"]]
  nr <- snakemake@resources[["nr"]]
  thread_num <- snakemake@threads
  mem_mb <- snakemake@resources[["mem_mb"]]
  time_limit <- (snakemake@resources[["time_min"]] - 20) * 60
  run_vanilla <- snakemake@params[["run_vanilla"]]
  perturbation_gene <- snakemake@params[["perturbation_gene"]]
  progeny_data <- "../data/progenyMembers.RData"
} else {
  BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/APLNR_KO"
  dds_path <- file.path(paste0(BASE_ANALYSIS_DIR), "deseq2/all.rds")
  diffexp_tb_path <- file.path(paste0(BASE_ANALYSIS_DIR), "results/diffexp/condition/AplnKO_vs_basal.diffexp.tsv")
  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  contrast_groups <- c("AplnKO", "basal")
  plot_path <- "."
  register(SerialParam())
  s_groups <- c("AplnKO", "basal")
  contrast_name <- glue::glue("{contrast_groups[[1]]} vs {contrast_groups[[2]]}")
  the_yaml <- yaml::read_yaml("../configs/VascAge_APLN_KO.yaml")
  comp_groups <- the_yaml$comp_groups
  color_scheme <- the_yaml$group_colors
  carnival_output <- "./test_output.RDS.gz"
  cplex_path <- "/home/heyer/software/external/CPLEX_Studio201/cplex/bin/x86-64_linux/cplex"
  thread_num <- 6
  temp_path <- "./"
  mem_mb <- 8192
  time_limit <- 3600
  run_vanilla <- TRUE
  progeny_data <- "./data/progenyMembers.RData"
}

# Read DESeq2 oobject and other tables
dds_obj <- readRDS(dds_path)
diffexp_tb <- read_tsv(diffexp_tb_path,
  col_names = c("gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj"),
  skip = 1
)
# Normalized_counts <- getVarianceStabilizedData(dds_obj)
Normalized_counts <- assay(vst(dds_obj, blind = F))
fpkm <- read_tsv(fpkm_path)


filer <- fpkm %>%
  dplyr::filter(gene %in% rownames(Normalized_counts)) %>%
  dplyr::filter(!duplicated(gname))

joined_df <- join_tables(diffexp_tb, filer) %>%
  dplyr::filter(!duplicated(gname))
Normalized_counts <- Normalized_counts[filer$gene, ]

rownames(Normalized_counts) <- filer$gname

joined_df %>%
  dplyr::select(gname, stat) %>%
  dplyr::filter(!is.na(stat)) %>%
  column_to_rownames(var = "gname") %>%
  as.matrix() -> diffexp_matrix




# regulons <- dorothea_mm %>% dplyr::filter(confidence %in% c('A', 'B'))
organism <- "mouse"
doro_net <- decoupleR::get_dorothea(organism = organism, levels = c("A", "B", "C"))
prog_net <- decoupleR::get_progeny(organism = organism, top = 100)

PathwayActivity <- PathwayActivity_CARNIVALinput <- run_wmean(
  mat = diffexp_matrix, network = prog_net, .source = "source",
  .target = "target", .mor = "weight", times = 1000, minsize = 5
) %>%
  dplyr::filter(statistic == "wmean") %>%
  as.data.frame()
if (any(abs(PathwayActivity$score) > 1)) {
  PathwayActivity$score <- sign(PathwayActivity$score) * (1 - PathwayActivity$p_value)
  warning("decoupler based enriched failed, falling back on (1-pvalue) * sign(score)")
}
# PathwayActivity <- PathwayActivity_CARNIVALinput <- progeny(diffexp_matrix, scale = TRUE, organism = 'Mouse', top =
# 100, perm = 10000, z_scores = F ) %>% t() %>% as.data.frame() %>% tibble::rownames_to_column(var = 'Pathway')

# colnames(PathwayActivity)[2] <- 'score'
progeny_key <- setNames(object = PathwayActivity$score, nm = PathwayActivity$source)

prog_net %>%
  mutate(progeny_activity = recode(source, !!!progeny_key)) %>%
  mutate(carnival_score = sign(weight) * progeny_activity) -> prog_net
prog_net %>%
  group_by(target) %>%
  summarise(carnival_score = mean(carnival_score)) -> prog_net_final

tf_activities_stat <- decoupleR::run_wmean(diffexp_matrix, network = doro_net, times = 1000, minsize = 5) %>%
  filter(statistic == "norm_wmean")
# options = list( minsize = 5, eset.filter = FALSE, cores = 1, verbose = FALSE, nes = TRUE ) )

prog_net_final %>%
  filter(!(target %in% tf_activities_stat$source)) -> prog_net_final

tf_activities <- tf_activities_CARNIVALinput <- tf_activities_stat %>%
  dplyr::select(source, score)


## Get Omnipath
omniR <- import_omnipath_interactions(organism = 10090)
# omniR <- import_pathwayextra_interactions(organism = 10090) signed and directed
omnipath_sd <- omniR %>%
  dplyr::filter(consensus_direction == 1 & (consensus_stimulation == 1 | consensus_inhibition == 1))

# changing 0/1 criteria in consensus_stimulation/inhibition to -1/1
omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1

# check consistency on consensus sign and select only those in a SIF format
sif <- omnipath_sd[, c("source_genesymbol", "consensus_stimulation", "consensus_inhibition", "target_genesymbol")] %>%
  dplyr::filter(consensus_stimulation == consensus_inhibition) %>%
  unique.data.frame()

sif$consensus_stimulation <- NULL
colnames(sif) <- c("source", "interaction", "target")

# remove complexes
sif$source <- gsub(":", "_", sif$source)
sif$target <- gsub(":", "_", sif$target)

# dorothea for CARNIVAL
tf_activities_carnival <- data.frame(tf_activities, stringsAsFactors = F)
rownames(tf_activities_carnival) <- tf_activities$source
tf_activities_carnival$source <- NULL
tf_list <- generateTFList(tf_activities_carnival, top = 50, access_idx = 1)
tf_vec <- tf_list$score[, ]

# For mouse change trp53 to tp53 and trp63 to tp63

names(tf_vec) <- gsub("^Trp", "Tp", names(tf_vec))

# progeny for CARNIVAL load(file = progeny_data) progenyMembers$gene$p53 <- 'TP53' progmem_mouse <-
# purrr::map(progenyMembers$gene, convertHumanGeneHomologs, jax_database = mouse_human_homologs) progenyMembers$gene <-
# progmem_mouse progenyMembers$gene$p53 <- 'Tp53'


# PathwayActivity_carnival <- data.frame(PathwayActivity, stringsAsFactors = F) rownames(PathwayActivity_carnival) <-
# PathwayActivity_carnival$Pathway PathwayActivity_carnival$Pathway <- NULL progenylist <- assignPROGENyScores( progeny
# = t(PathwayActivity_carnival), progenyMembers = progenyMembers, id = 'gene', access_idx = 1 ) progeny_vec <-
# progenylist$score
progeny_vec <- setNames(prog_net_final$carnival_score, nm = prog_net_final$target)
# get initial nodes

lp_opts <- CARNIVAL::defaultCplexCarnivalOptions(
  solverPath = cplex_path, cplexMemoryLimit = mem_mb, threads = thread_num,
  timelimit = time_limit, lpFilename = file.path(temp_path, "lptest.lp"), outputFolder = file.path(temp_path, "carnout")
)
cplex_opts <- suggestedCplexSpecificOptions()
lp_opts[names(cplex_opts)] <- cplex_opts
# lp_opts$solverPath <- cplex_path
lp_opts$cplexMemoryLimit <- mem_mb
lp_opts$threads <- thread_num
lp_opts$timelimit <- time_limit
# lp_opts$lpFilename <- file.path(temp_path, 'lptest.lp') lp_opts$outputFolder <- file.path(temp_path, 'carnout')
dir.create(lp_opts$outputFolder, showWarnings = F, recursive = T)

# run carnival
dir.create(file.path(temp_path, "carnout"), recursive = T, showWarnings = F)
# setwd(file.path(temp_path, 'carnout'))
if (run_vanilla) {
  carnival_result <- runVanillaCarnival(
    perturbations = c(perturbation_gene = 1), measurements = unlist(tf_vec), priorKnowledgeNetwork = sif,
    weights = unlist(progeny_vec), carnivalOptions = lp_opts
  )
} else {
  carnival_result <- runInverseCarnival(
    measurements = unlist(tf_vec), priorKnowledgeNetwork = sif, weights = unlist(progeny_vec),
    carnivalOptions = lp_opts
  )
}
if (length(carnival_result$sifAll) < 50) {
  if (nr >= 2) {
    write(paste0("Failed to converge after 2 attempts"), file = stderr())
    carnival_result <- list()
  } else {
    stop(glue::glue("Attempt {nr}. Failed to converge restarting"))
  }
}
saveRDS(carnival_result, file = carnival_output)

R Snakemake ggplot2 dplyr tidyr DESeq2 Consensus readr tibble biomaRt decoupleR OmnipathR CARNIVAL progeny visNetwork From line 3 of scripts/test_carnival.R

knitr::opts_chunk$set(echo = TRUE)

R Markdown From line 10 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

library(readr)
library(piano)
library(dplyr)
library(ggplot2)
library(tibble)
library(tidyr)
library(dplyr)
library(scales)
library(plyr)
library(GSEABase)
library(network)
library(reshape2)
library(cowplot)
library(pheatmap)
library(ggraph)
library(tidygraph)
if (!require(RNAscripts)) {
  devtools::install("../scripts/RNAscripts", upgrade = "never", quiet = TRUE)
}
library(RNAscripts)
# set working directory
# setwd("~/Projects/transcriptutorial/scripts")
# source("./support_networks.r")
## We also load the support functions
if (exists("snakemake")) {
  carnival_sample_result <- snakemake@input[["carnival_sample_result"]]
  tt_basepath <- snakemake@params[["tutorial_source_path"]]

  names(carnival_sample_result) <- snakemake@params[["sample_names"]]
  stopifnot(length(carnival_sample_result) > 0)
  workflow_config <- snakemake@config

  sample_table <- readr::read_tsv(workflow_config$samples)
  organism <- workflow_config$organism
} else {
  BASE_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/tec_aplnrKO/results/carnival/sample_resolution"
  workflow_config <- yaml::read_yaml("../../../configs/tec_aplnrKO.yaml")
  sample_table <- readr::read_tsv("../../../data/tumor_vs_ec/Aplnr_KO.tsv")
  sample_ids <- sample_table %>% pull(sample)
  carnival_sample_result <- file.path(BASE_DIR, paste0(sample_ids, "_carn_res.RDS.gz"))
  names(carnival_sample_result) <- sample_ids
  base_path <- "/desktop-home/heyer/projects/Vascular_Aging"
  tt_basepath <- file.path(base_path, "RNAseq/rna-seq-star-deseq2/workflow/scripts/transcriptutorial")
  organism <- "Mus musculus"
}
source(file.path(tt_basepath, "support_enrichment.r"))
source(file.path(tt_basepath, "support_networks.r"))

carnival_sample_resolution <- purrr::map(carnival_sample_result, readRDS)

# Check if any are 0

length_index <- purrr::map(carnival_sample_resolution, length)
# Check if any are null
lgl_index <- purrr::map_int(carnival_sample_resolution, length) == 0
carnival_sample_resolution <- carnival_sample_resolution[!lgl_index]
stopifnot(length(carnival_sample_resolution) > 0)
process_carnival <- function(carnival_res) {
  carnival_res$weightedSIF <- carnival_res$weightedSIF %>% dplyr::filter(Weight != 0)
  #  carnival_res$nodesAttributes <- carnival_res$nodesAttributes %>% dplyr::filter(AvgAct != 0)
  carnival_res
}

carnival_sample_resolution <- purrr::map(carnival_sample_resolution, process_carnival)

R Markdown Snakemake ggplot2 dplyr tidyr readr cowplot reshape2 tibble scales plyr ggraph GSEABase tidygraph network PIANO From line 63 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

# read CARNIVAL results
omnipath_id <- RNAscripts::get_organism_omnipath_id(organism)
omniR <- OmnipathR::import_omnipath_interactions(organism = omnipath_id)

# signed and directed
omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 &
  (consensus_stimulation == 1 |
    consensus_inhibition == 1
  ))

# changing 0/1 criteria in consensus_stimulation/inhibition to -1/1
omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1

# check consistency on consensus sign and select only those in a SIF format
sif <- omnipath_sd[, c("source_genesymbol", "consensus_stimulation", "consensus_inhibition", "target_genesymbol")] %>%
  dplyr::filter(consensus_stimulation == consensus_inhibition) %>%
  unique.data.frame()

sif$consensus_stimulation <- NULL
colnames(sif) <- c("source", "interaction", "target")

# remove complexes
sif$source <- gsub(":", "_", sif$source)
sif$target <- gsub(":", "_", sif$target)

R Markdown Consensus From line 136 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

# get only summary files from CARNIVAL results
sifts <- lapply(carnival_sample_resolution, function(x) {
  x$weightedSIF
})
nodos <- lapply(carnival_sample_resolution, function(x) {
  x$nodesAttributes
})
write(names(sifts), file = stderr())
# Calculate the number of edges and nodes in the networks and its density
node_edge <- do.call(rbind, lapply(sifts, count_edges_nodes_degree))


# Calculate degree distribution for a sample
yeet <- do.call(rbind, sifts) %>% unique()
count_degree <- yeet %>% degree_count()

# degree distribution
p <- data.frame(table(count_degree$total_count) / nrow(count_degree))
colnames(p) <- c("Var1", "total_degree")
p <- merge.data.frame(p, data.frame(table(count_degree$in_count) / nrow(count_degree)), all = T)
colnames(p) <- c("Var1", "total_degree", "in_degree")
p <- merge.data.frame(p, data.frame(table(count_degree$out_count) / nrow(count_degree)), all = T)
colnames(p) <- c("k", "total_degree", "in_degree", "out_degree")
p <- melt(p, value.name = "p", id.vars = "k")
p$k <- relevel(p$k, "0")

# visualise
ggdat <- as.data.frame(node_edge) %>%
  tibble::rownames_to_column(var = "sample") %>%
  dplyr::mutate(condition = gsub(".Rep[0-9]{1}", "", sample))

# Plotting

# relation between number of edges and nodes
ggplot(ggdat, aes(x = nodes, y = edges, color = as.factor(condition))) +
  geom_point() +
  geom_text(
    label = ggdat$sample,
    check_overlap = TRUE,
    vjust = 0,
    nudge_y = 0.5,
    show.legend = F
  ) +
  theme_bw(base_size = 15) +
  guides(color = guide_legend(title = "Conditions")) +
  ggtitle("Node-edge composition")

R Markdown ggplot2 From line 165 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

# degree distribution
levels(p$k) <- levels(p$k) %>%
  as.numeric() %>%
  sort()
dd <- ggplot(data = p, aes(x = k, y = p, group = variable, color = variable)) +
  geom_point() +
  geom_line() +
  theme_bw() +
  theme(legend.position = "none") +
  guides(color = guide_legend(title = "degree type")) +
  ggtitle("Degree distribution")

ddp <- ggplot(data = p, aes(x = as.numeric(k), y = p, group = variable, color = variable)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(
    breaks = as.numeric(p$k),
    trans = scales::log_trans()
  ) +
  scale_y_log10(
    breaks = trans_breaks("log10", function(x) 10^x),
    labels = trans_format("log10", math_format(10^.x))
  ) +
  annotation_logticks() +
  theme_bw() +
  guides(color = guide_legend(title = "degree type")) +
  ggtitle("Degree distribution (log scale)") +
  xlab("k (ln)") +
  ylab("p (log10)")

plot_grid(dd, ddp, labels = "auto", rel_widths = c(1, 2))

R Markdown ggplot2 From line 214 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

```

R Markdown From line 261 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

get_common_interactions <- function(interaction_list, psmpl_per = 95,
                                    s_table = sample_table,
                                    condition_sel) {
  sample_id_list <- s_table %>%
    filter(condition %in% condition_sel) %>%
    pull(sample)
  interaction_list <- interaction_list[, sample_id_list]

  if (!is.null(dim(interaction_list))) {
    shared_interactions_WT <- getCoreInteractions(topology = interaction_list, psmpl = psmpl_per)

    # Visualise the interactions
    colnames(shared_interactions_WT) <- c("from", "sign", "to")
    labels_edge <- c("-1" = "inhibition", "1" = "activation")
    nodes <- data.frame(union(shared_interactions_WT$from, shared_interactions_WT$to))
    colnames(nodes) <- "nodes"
    nodes$label <- nodes$nodes

    gg_graph <- tidygraph::tbl_graph(nodes = nodes, edges = shared_interactions_WT) %>%
      ggraph::ggraph(layout = "auto") +
      geom_node_point(color = "#C0C0C0", size = 8) +
      geom_edge_link(arrow = arrow(), aes(edge_colour = as.factor(sign))) +
      theme_graph() +
      geom_node_text(aes(label = label), vjust = 0.4)
  } else {
    gg_graph <- NULL
  }
  gg_graph
}

R Markdown From line 284 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

# create a matrix of all interactions for all samples
write(dim(sif), file = stderr())
interactions <- getTopology(networks = sifts, scafoldNET = sif)
colnames(interactions) <- names(carnival_sample_resolution)
# FIxes bug in topology function (To lazy to actually fix)
ncol_interact <- ncol(interactions)
# interactions <- interactions[rowSums(interactions) > 0,]
# interactions <- interactions[,-c(1:ncol_interact/2)]
# get the edges per sample
# get the edges per sample
net_int <- apply(interactions, 2, function(x, r) {
  r[which(!is.na(x))]
}, rownames(interactions))

# calculate Jaccard indexes per pair
combined <- expand.grid(1:length(names(sifts)), 1:length(names(sifts)))
jac_index <- matrix(
  data = NA, nrow = length(names(sifts)), ncol = length(names(sifts)),
  dimnames = list(names(sifts), names(sifts))
)

for (i in 1:nrow(combined)) {
  n <- names(sifts)[combined[i, 1]]
  m <- names(sifts)[combined[i, 2]]
  jac_index[n, m] <- length(intersect(net_int[[n]], net_int[[m]])) / length(union(net_int[[n]], net_int[[m]]))
}

# Visualize the indexes in a heatmap

pheatmap::pheatmap(jac_index,
  fontsize = 14,
  fontsize_row = 10, fontsize_col = 10,
  angle_col = 45, treeheight_col = 0
)
corrplot::corrplot(jac_index, order = "hclust")

R Markdown From line 343 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

groups_to_check <- c(
  workflow_config$comp_groups,
  as.list(setNames(
    object = sample_table$condition %>% unique(),
    nm = sample_table$condition %>% unique()
  ))
)


yoted <- purrr::map(groups_to_check, get_common_interactions, interaction_list = interactions, psmpl_per = 60, s_table = sample_table[!lgl_index, ])
if (any(purrr::map(yoted, is.null))) {
  null_index <- purrr::map_lgl(yoted, is.null)
  yoted <- yoted[!null_index]
}
yoted <- purrr::map2(yoted, names(yoted), function(x, y) x + ggtitle(y))

yoted

R Markdown From line 391 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

build_matrix_from_nodes <- function(node_list, node_type = "AvgAct") {
  gene_index <- purrr::map(node_list, ~ pull(., Node)) %>%
    unlist() %>%
    unique()
  node_mat <- purrr::map(node_list, ~ dplyr::select(., Node, !!node_type)) %>% purrr::reduce(full_join, by = "Node")
  colnames(node_mat) <- c("Node", names(node_list))
  node_mat
}

avg_mat <- build_matrix_from_nodes(nodos)
rownames(avg_mat) <- avg_mat$Node
avg_mat <- subset(avg_mat, select = -c(Node)) %>% as.matrix()
non_zero_index <- apply(avg_mat, 1, function(x) length(which(x != 0)) >= 2)



ComplexHeatmap::Heatmap(avg_mat[non_zero_index, ], column_names_rot = 45, row_names_gp = grid::gpar(fontsize = 6), cluster_columns = FALSE)

R Markdown From line 411 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

432	sessionInfo()

R Markdown From line 432 of transcriptutorial/06_analysis_CARNIVAL_results.Rmd

library(readr)
library(CARNIVAL)
if (!require(RNAscripts)) {
  devtools::install("../scripts/RNAscripts", upgrade = "never", quiet = TRUE)
}
library(RNAscripts)
library(magrittr)
library(dplyr)
library(OmnipathR)
# files
if (exists("snakemake")) {
  tf_act_path <- snakemake@input[["tf_activities"]]
  path_act_path <- snakemake@input[["pathway_activities"]]
  cplex_path <- snakemake@config[["cplex_solver"]]
  carnival_sample_result <- snakemake@output[["carnival_sample_result"]]
  thread_num <- snakemake@threads
  time_limit <- (snakemake@resources[["time_min"]] - 20) * 60
  mem_mb <- snakemake@resources[["mem_mb"]]
  run_vanilla <- snakemake@params[["run_vanilla"]]
  temp_path <- snakemake@params[["temp_path"]]
  nr <- snakemake@resources[["nr"]]
  sample_id <- snakemake@wildcards[["sample"]]
  progeny_data <- "./workflow/data/progenyMembers.RData"
} else {
  base_path <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/APLN_KO"
  tf_act_path <- file.path(base_path, "dorothea/TF_act_sample_resolution.csv")
  path_act_path <- file.path(base_path, "progeny/sample_progney.csv")
  carnival_output <- "./test_output.RDS.gz"
  cplex_path <- "/home/heyer/software/external/CPLEX_Studio201/cplex/bin/x86-64_linux/cplex"
  thread_num <- 4
  temp_path <- file.path(base_path, "results/carnival/temp/VascAge_Young-EC-Plus-5")
  mem_mb <- 8192
  time_limit <- 3600
  run_vanilla <- FALSE
  sample_id <- "VascAge_Young-EC-Plus-5"
  progeny_data <- "./workflow/data/progenyMembers.RData"
  nr <- 1
}
if (nr == 4) {
  saveRDS(list(), file = carnival_sample_result)
  write("Failed to converge", file = stderr())
  quit(save = "no")
}
sample_id <- stringr::str_replace_all(sample_id, pattern = "-", replacement = ".")

tf_activities <- read_csv(tf_act_path)
pathway_activity <- read_csv(path_act_path)

mouse_human_homologs <- readr::read_tsv("http://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt",
  col_names = c("hgene", "hID", "mgene", "mID", "lcol")
)



## Get Omnipath
get_organism_number <- function(organism_name) {
  if (organism_name == "human") {
    organism_number <- 9606
  } else if (organism_name == "mouse") {
    organism_number <- 10090
  }
  organism_number
}
organism <- "mouse"
org_nb <- get_organism_number(organism)
omniR <- import_omnipath_interactions(organism = org_nb)


# signed and directed
omnipath_sd <- omniR %>% dplyr::filter(consensus_direction == 1 &
  (consensus_stimulation == 1 |
    consensus_inhibition == 1
  ))

# changing 0/1 criteria in consensus_stimulation/inhibition to -1/1
omnipath_sd$consensus_stimulation[which(omnipath_sd$consensus_stimulation == 0)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 1)] <- -1
omnipath_sd$consensus_inhibition[which(omnipath_sd$consensus_inhibition == 0)] <- 1

# check consistency on consensus sign and select only those in a SIF format
sif <- omnipath_sd[, c(
  "source_genesymbol", "consensus_stimulation",
  "consensus_inhibition", "target_genesymbol"
)] %>%
  dplyr::filter(consensus_stimulation == consensus_inhibition) %>%
  unique.data.frame()

sif$consensus_stimulation <- NULL
colnames(sif) <- c("source", "interaction", "target")

# remove complexes
sif$source <- gsub(":", "_", sif$source)
sif$target <- gsub(":", "_", sif$target)





# dorothea for CARNIVAL
tf_activities_carnival <- data.frame(tf_activities, stringsAsFactors = F)
rownames(tf_activities_carnival) <- tf_activities$source
tf_activities_carnival$source <- NULL
tfList <- generateTFList(tf_activities_carnival,
  top = 50,
  access_idx = 1:ncol(tf_activities_carnival)
)

# progeny for CARNIVAL
load(file = progeny_data)
# progenyMembers$gene$p53 <- "TP53"
if (organism == "mouse") {
  progmem_mouse <- purrr::map(progenyMembers$gene, convertHumanGeneHomologs, mouse_human_homologs)
  progenyMembers$gene <- progmem_mouse
  progenyMembers$gene$p53 <- "Tp53"
  names(progenyMembers$gene)[names(progenyMembers$gene) == "JAK.STAT"] <- "JAK-STAT"
} else {
  progenyMembers$gene$p53 <- "TP53"
}
pathway_activity_carnival <- data.frame(pathway_activity, stringsAsFactors = F)
rownames(pathway_activity_carnival) <- pathway_activity_carnival %>% dplyr::pull(pathways)
# pathway_activity_carnival <- pathway_activity_carnival[,-c()]
pathway_activity_carnival$pathways <- NULL
progenylist <- assignPROGENyScores(
  progeny = t(pathway_activity_carnival),
  progenyMembers = progenyMembers,
  id = "gene",
  access_idx = 1:ncol(pathway_activity_carnival)
)
# get initial nodes


# run carnival for all samples
dir.create(file.path(temp_path, "carnout"), recursive = T, showWarnings = F)
# setwd(file.path(temp_path, "carnout"))
lp_opts <- CARNIVAL::defaultCplexCarnivalOptions(
  solverPath = cplex_path,
  cplexMemoryLimit = mem_mb,
  threads = thread_num,
  timelimit = time_limit,
  lpFilename = file.path(temp_path, "lptest.lp"),
  outputFolder = file.path(temp_path, "carnout")
)
cplex_opts <- suggestedCplexSpecificOptions()
lp_opts[names(cplex_opts)] <- cplex_opts
# lp_opts$solverPath <- cplex_path
lp_opts$cplexMemoryLimit <- mem_mb
lp_opts$threads <- thread_num
lp_opts$timelimit <- time_limit
if (run_vanilla) {
  carnival_result <- runVanillaCarnival(
    perturbations = c("Aplnr" = 1, "Apln" = 1),
    measurements = unlist(tfList[[sample_id]]),
    priorKnowledgeNetwork = sif,
    weights = unlist(progenylist[[sample_id]]),
    carnivalOptions = lp_opts
  )
} else {
  carnival_result <- runInverseCarnival(
    measurements = unlist(tfList[[sample_id]]),
    priorKnowledgeNetwork = sif,
    weights = unlist(progenylist[[sample_id]]),
    carnivalOptions = lp_opts
  )

  # transoform to data.frame
  #  carnival_result$weightedSIF <- data.frame(carnival_result$weightedSIF,
  #  stringsAsFactors = F)
  #  carnival_result$weightedSIF$Sign <- as.numeric(carnival_result$weightedSIF$Sign)
  #  carnival_result$weightedSIF$Weight <- as.numeric(carnival_result$weightedSIF$Weight)

  #  carnival_result$nodesAttributes <- data.frame(carnival_result$nodesAttributes, stringsAsFactors = F)
  #  carnival_result$nodesAttributes$ZeroAct <- as.numeric(carnival_result$nodesAttributes$ZeroAct)
  #  carnival_result$nodesAttributes$UpAct <- as.numeric(carnival_result$nodesAttributes$UpAct)
  #  carnival_result$nodesAttributes$DownAct <- as.numeric(carnival_result$nodesAttributes$DownAct)
  #  carnival_result$nodesAttributes$AvgAct <- as.numeric(carnival_result$nodesAttributes$AvgAct)
}
if (length(carnival_result$sifAll) < 50) {
  if (nr >= 2) {
    write(paste0("Failed to converge after 2 attempts"), file = stderr())
    carnival_result <- list()
  } else {
    stop("Failed to converge restarting")
  }
}

saveRDS(carnival_result, carnival_sample_result)

R Snakemake dplyr Consensus readr magrittr OmnipathR CARNIVAL From line 1 of transcriptutorial/sample_resolution_carnival.R

library(readr)
library(viper)
library(DESeq2)
library(dorothea)
library(RNAscripts)
library(magrittr)

# put whatever is your working directory here
if (exists("snakemake")) {
  dds_obj_rlog <- snakemake@input[["dds_obj"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  outpath <- snakemake@output[["sample_dorothea_table"]]
  threads <- snakemake@threads
} else {
  BASE_PATH <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/apelin_exp"
  dds_obj_rlog <- file.path(BASE_PATH, "deseq2/rlog_transform.RDS.gz")
  fpkm_path <- file.path(BASE_PATH, "fpkm/all.tsv")
  threads <- 1
}



# count_df_rlog <- as.data.frame(read_csv("data/count_df_rlog.csv"))
count_df_rlog <- readRDS(dds_obj_rlog) %>%
  assay()
# row.names(count_df_rlog) <- count_df_rlog$gene

# count_df_rlog <- count_df_rlog[,-1]
# count_df_rlog <- count_df_rlog[complete.cases(count_df_rlog),]

fpkm <- read_tsv(fpkm_path)

rename_count_rownames <- function(count_table, fpkm_table) {
  filer <- fpkm_table %>%
    dplyr::filter(gene %in% rownames(count_table)) %>%
    dplyr::filter(!duplicated(gname))

  count_table <- count_table[filer$gene, ]
  rownames(count_table) <- filer$gname
  count_table
}

count_df_rlog <- rename_count_rownames(
  count_table = count_df_rlog,
  fpkm_table = fpkm
)

# regulons <- dorothea_mm %>% dplyr::filter(confidence %in% c("A", "B", "C"))
doro_net <- decoupleR::get_dorothea(organism = "mouse", levels = c("A", "B", "C"))

TF_activity <- decoupleR::run_wmean(count_df_rlog, network = doro_net, times = 1000, minsize = 5) %>%
  dplyr::filter(statistic == "norm_wmean") %>%
  dplyr::select(source, condition, score) %>%
  tidyr::pivot_wider(names_from = condition, values_from = score)

# TF_activity <- run_viper(count_df_rlog,
#  regulons = regulons,
#  options = list(
#    method = "scale", minsize = 4,
#    eset.filter = FALSE, cores = threads,
#    verbose = FALSE
#  )
# ) %>% as.data.frame()
### Preparing dorothea

## Dorothea/viper

# TF_activity$TF <- row.names(TF_activity)


readr::write_csv(TF_activity, outpath)

R Snakemake readr magrittr viper From line 1 of transcriptutorial/sample_resolution_dorothea.R

library(DESeq2)
library(readr)
library(viper)
library(progeny)
library(RNAscripts)
library(magrittr)
library(decoupleR)

if (exists("snakemake")) {
  dds_obj_rlog <- snakemake@input[["dds_obj"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  outpath <- snakemake@output[["sample_progeny_table"]]
  threads <- snakemake@threads
} else {
  BASE_PATH <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022"
  dds_obj_rlog <- file.path(BASE_PATH, "deseq2/rlog_transform.RDS.gz")
  fpkm_path <- file.path(BASE_PATH, "fpkm/all.tsv")
  threads <- 1
}


## Read and import data
count_df_rlog <- readRDS(dds_obj_rlog) %>%
  assay() %>%
  as.matrix()

fpkm <- read_tsv(fpkm_path)

# Rename rownames
print(rownames(count_df_rlog))
count_df_rlog <- rename_count_rownames(
  count_table = count_df_rlog,
  fpkm_table = fpkm
)

# Import progeny network and run decoupler wmean
organism <- "mouse"

prog_net <- decoupleR::get_progeny(organism = organism, top = 100)

PathwayActivity_counts <- decoupleR::run_wmean(count_df_rlog,
  network = prog_net,
  .mor = "weight",
  times = 1000
) %>%
  dplyr::filter(statistic == "norm_wmean") %>%
  dplyr::mutate(prog_score = (1 - p_value) * sign(score)) %>%
  tidyr::pivot_wider(
    id_cols = "condition", names_from = "source",
    values_from = "prog_score"
  ) %>%
  tibble::column_to_rownames("condition") %>%
  as.matrix()


PathwayActivity_counts <- as.data.frame(t(PathwayActivity_counts))
PathwayActivity_counts$pathways <- rownames(PathwayActivity_counts)

readr::write_csv(PathwayActivity_counts, outpath)

R Snakemake readr magrittr viper decoupleR progeny From line 1 of transcriptutorial/sample_resolution_progeny.R

__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
# Set this to False if multiqc should use the actual input directly
# instead of parsing the folders where the provided files are located
use_input_files_only = snakemake.params.get("use_input_files_only", False)

if not use_input_files_only:
    input_data = set(path.dirname(fp) for fp in snakemake.input)
else:
    input_data = set(snakemake.input)

output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {extra}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_data}"
    " {log}"
)