Code Snippets

Code for the manuscript "Machine learning reveals STAT motifs as predictors for GR-mediated gene repression"

suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c( "--contrast_DexLPSvLPS"),
              type="character",
              help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"),
  make_option(c("--assignment_summit_prox"),
              type="character",
              help="Path to rds file of proximity based assignment of peak summits to genes"),
  make_option(c("--assignment_summits_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in DexLPS condition)"),
  make_option(c("--assignment_summits_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in LPS condition)"),
  make_option(c("--assignment_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in DexLPS condition)"),
  make_option(c( "--assignment_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in LPS condition)"),
  make_option(c("--motifcounts_summitregion"),
              type="character",
              help="Path to rds file of fimo motifcounts within summitregions"),
  make_option(c("--motifcounts_abcregion_dexlps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in DexLPS condition)"),
  make_option(c("--motifcounts_abcregion_lps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in LPS condition)"),
  make_option(c( "--outdir"),
              type="character",
              help="Path to output directory")
  )

opt <- parse_args(OptionParser(option_list=option_list))

dir.create(opt$outdir)

#change default for stringAsFactors
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(plyranges, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))

#set defaults for ggplot2 figures
theme_update(panel.background = element_rect(fill = "transparent", colour = NA),
             plot.background = element_rect(fill = "transparent", colour = NA),
             legend.background = element_rect(fill = "transparent", colour = NA),
             legend.key = element_rect(fill = "transparent", colour = NA),
             text=element_text(size=6, family = "ArialMT", colour="black"),
             title=element_text(size=8, family="ArialMT", colour="black"),
             panel.grid.major = element_line(colour="grey", size=0.2),
             panel.grid.minor = element_blank(),
             axis.text = element_text(size=6, family="ArialMT", colour="black"),
             axis.line = element_line(colour="black"),
             axis.ticks = element_line(colour="black"),
             legend.key.size = unit(6, 'points'), #change legend key size
             legend.key.height = unit(6, 'points'), #change legend key height
             legend.key.width = unit(6, 'points'), #change legend key width
             legend.text = element_text(size=6, family="ArialMT", colour="black"))

set.seed(12345)

#-------------------------------
## read in  data
#-------------------------------

contrast_DexLPSvLPS <- read.delim(opt$contrast_DexLPSvLPS)

for (optname in names(opt)[2:9]){ #except for the outdir
  print(paste0("Loading ", optname))
  assign(optname, readRDS( opt[[optname]] ))
}

#--------------------------------------------
## ---- function definitions
#--------------------------------------------

coerce_coef2df <- function(model_coef){
  model_coef <- as.matrix(model_coef)
  model_coef <- as.data.frame(model_coef)
  model_coef$names <- rownames(model_coef)
  colnames(model_coef) <- c("estimates", "names")
  return(model_coef)
}

#------------function will------------:
# * merge the motifdata with gene assignments and 
# * then merge the gene expression changes,
# * filter for DE genes and aggregate per gene
#--------------------------------------
merge_motifdata_with_assignments <- function( motifcounts, assignments, contrast, 
                                              maxonly=FALSE, excludepromoters=FALSE, weightby=FALSE, sepPromEnh=FALSE){

  # check if we should use all abc assignments, or only the max one of each peakID
  if(maxonly==TRUE){
    assignments <- assignments %>% group_by(name) %>% filter(abcscore==max(abcscore)) %>% distinct()
  } else{
    assignments <- assignments
  }

  if(excludepromoters=="all"){
    assignments <- assignments %>% filter(!class=="promoter")
  } else if (excludepromoters=="onlyNONself"){
    assignments <- assignments %>% filter(!c(class=="promoter" & isSelfPromoter=="False"))
  } else{
    assignments <- assignments
  }

  motifdf <- merge(motifcounts, assignments, 
                   by.x="name", by.y="name")

  motifdf <- motifdf %>% relocate(c(anno))

  # merge the expression change
  # use mgi_symbol for prox based, otherwise ensemblID
  # We DONT set all.x=TRUE because we don't care about predicting gene that aren't even expressed or that we don't have a clear label for
  if(maxonly=="prox"){
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="mgi_symbol")
  } else {
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="Row.names")
  }

  #recode the logFC and padj into a label (optionally through command line arguments)
  motifdf <- motifdf %>% 
    mutate(label=case_when(log2FoldChange>0.58 & padj < 0.05 ~ "up",
                           log2FoldChange<(-0.58) & padj < 0.05 ~ "down",
                           TRUE ~ "no_change")) %>%
    filter(label!="no_change") %>%
    mutate(label=factor(label,
                        levels=c("down","up"),
                        labels=c(0,1))) %>%
    relocate(label)

  # chr 2, 3 and 4 (20%) were used as the tuning set for hyperparameter tuning. 
  # Regions from chromosomes 1, 8 and 9 (20%) were used as the test set for performance evaluation 
  # The remaining regions were used for model training.

  #------aggregate over gene SYMBOL
  if ("abcscore" %in% colnames(motifdf)) { # loop for ABC based assignments

    if (weightby=="abcscore"){
      unselect_col <- "abcnumerator"
    } else if(weightby=="abcnumerator") {
      unselect_col <- "abcscore"
    } else{
      unselect_col <- c("abcscore","abcnumerator")
    }

    if (sepPromEnh==TRUE){
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(.,across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno,class) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()

      # From here we need to cast the motifcounts for the promoterregions, so that in the end we have one row per gene (instead of 1-2)
      motifdf_aggr <- motifdf_aggr %>% tidyr::pivot_wider(id_cols=c(label,seqnames,anno), 
                                                          names_from=class, 
                                                          values_from = !c(label,seqnames,anno, class),
                                                          values_fill = 0)
    } else {
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(., across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()
    }


  } else { # loop for prox based assignments
    motifdf_aggr <-
      motifdf %>% 
      dplyr::select(!c("name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype")) %>% 
      group_by(label,seqnames,anno) %>%
      dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
      ungroup()
  }

  return(motifdf_aggr)

}


#------------function needs------------:
# featurematrix and indexes to perform split of test vs trainval set
# (trainval set is used for CV to find optimal regularization)
#------------function will------------:
# run cv.glmnet (as elastic net regression) and pick lambda.1se as regularization
# determine model performance on test set
# plots for raw counts of certain factors are currently turned of
complete_GLM_analysis <- function(motifdf, trainvalidx, genenames){

  # Create training subset for model development & testing set for model performance testing
  # make it based on chromosomes, instead of completely random
  # as in bpnet, we set aside chr 1,8 and 9 for testing

  #inTrain <- sort(sample(nrow(motifdata_abc_aggr), nrow(motifdata_abc_aggr)*0.75))

  features_train <- motifdf[ trainvalidx, -c(1,2,3)] %>% as.matrix()
  features_test <- motifdf[ -trainvalidx, -c(1,2,3)] %>% as.matrix()
  targets_train <- motifdf[ trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()
  targets_test <- motifdf[ -trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()

  #------------------------
  ## Elastic net
  #------------------------

  cvfit_net <- glmnet::cv.glmnet(x=features_train, 
                                 y=targets_train, 
                                 family="binomial", 
                                 type.measure = "auc",
                                 nfolds = 6,
                                 alpha=0.5)


  # Performance on training data
  targets_train_net.prob <- predict(cvfit_net,
                                    type="response",
                                    newx = features_train,
                                    s = 'lambda.min')
  pred_train_net <- ROCR::prediction(targets_train_net.prob[,1], targets_train) #only need the probabilities for 1's

  auc_ROCR_train_net <- ROCR::performance(pred_train_net, measure = "auc") #to assess AUC for model
  auc_ROCR_train_net@y.values[[1]]

  # Predict on unseen data
  targets_net.prob <- predict(cvfit_net,
                              type="response",
                              newx = features_test,
                              s = 'lambda.min')
  pred_net <- ROCR::prediction(targets_net.prob[,1], targets_test) #only need the probabilities for 1's

  auc_ROCR_net <- ROCR::performance(pred_net, measure = "auc") #to assess AUC for model
  auc_ROCR_net@y.values[[1]]

  #------------------------
  ## Model coefficients
  #------------------------

  coef_df_net <- coerce_coef2df( coefficients(cvfit_net, s = 'lambda.min'))

  gg_net <- ggplot( data=coef_df_net %>% filter(abs(estimates)>0) )+
    geom_bar(aes(x=reorder(names, -estimates), y=estimates),
             stat="identity")+
    labs(title=" ",x="", y="Net coefs")+
    #coord_flip()+
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          plot.margin = unit(c(0, 0, -6, 10), "points"))

  #------------------------
  # ROC curves
  #------------------------
  # calculate probabilities for TPR/FPR for predictions
  perf_train_net <- ROCR::performance(pred_train_net,"tpr","fpr")
  perf_net <- ROCR::performance(pred_net,"tpr","fpr")

  # plot ROC curve

  gg_ROC <- ggplot()+
    geom_line( aes(x=perf_train_net@x.values[[1]], y=perf_train_net@y.values[[1]], colour = "train_net") ) +
    geom_line( aes(x=perf_net@x.values[[1]],y=perf_net@y.values[[1]], colour = "test_net") ) +
    geom_abline(intercept=0,slope=1, linetype=4, colour="grey")+
    scale_colour_manual(values=c("darkblue", "blue"), 
                        name=" ",
                        breaks=c("train_net", "test_net"),
                        labels=c(paste("net train. AUC:",round( auc_ROCR_train_net@y.values[[1]], 2)) ,
                                 paste("net test. AUC:", round( auc_ROCR_net@y.values[[1]], 2))
                        )
    )+
    labs(x="False positive rate", y="True positive rate")+
    theme(
      legend.position=c(0.65,0.15)
    )

  # add metric to global variable
  new_metrics = data.frame(
    net_train = auc_ROCR_train_net@y.values[[1]],
    net_test = auc_ROCR_net@y.values[[1]] 
  )


  full_panel <- ggpubr::ggarrange(gg_ROC,gg_net, nrow=2,
                                  labels=c("A","B"),
                                  heights = c(1,1))

  results <- list()
  results[[1]] <- full_panel
  results[[2]] <- new_metrics
  results[[3]] <- coef_df_net
  results[[4]] <- cvfit_net
  return(results)
}



#------------------------------------------------
## initialize object to track performance metrics
#------------------------------------------------

AUC_metrics = data.frame(
  # parameter combination
  condition=character(),
  motifdata=character(),
  excludepromoters=character(),
  onlymax=character(),
  weight=character(),
  sepPromEnh=character(),
  # performance
  net_train = numeric(),
  net_test = numeric()
)

net_model_coefs = list()


#------------------------------------------------
## run proximity based
#------------------------------------------------

# This is independent of the ABC results (no need to loop through different assignment variations)

print("Running GLM on prox-based assignments")
my_rdsfile <- paste0(opt$outdir,"prox.rds")
if(!file.exists(my_rdsfile)){

  motifdata_aggr <- merge_motifdata_with_assignments(motifcounts_summitregion,
                                                     assignment_summit_prox,
                                                     contrast_DexLPSvLPS,
                                                     maxonly="prox",
                                                     excludepromoters=FALSE,
                                                     weightby = FALSE,
                                                     sepPromEnh = FALSE)
  motifdata_aggr_scaled <- motifdata_aggr %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector)))

  motifdata_aggr_tranval_idx <- motifdata_aggr_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))

  performance <- complete_GLM_analysis (motifdata_aggr_scaled, motifdata_aggr_tranval_idx, genenames=genenames)

  #------------------------
  ## Raw counts
  #------------------------
  nr3c1counts <- motifdata_aggr %>% 
    group_by(label) %>% 
    dplyr::count(NR3C1) %>% 
    mutate(counts_fac = case_when(NR3C1>=1 ~ ">=1",
                                  NR3C1==0 ~ "0"), # aggregate it towards to top end
           counts_fac = factor(as.character(counts_fac), levels=c("0",">=1")) ) %>%
    group_by(counts_fac) %>% 
    mutate(freq = n / sum(n)) %>%
    group_by(label,counts_fac) %>%
    dplyr::summarize(sum_n=sum(n),
              sum_freq=sum(freq))

  gg_nr3c1counts <- ggplot(data=nr3c1counts , aes(x=counts_fac,y=sum_freq,fill=label)) + 
    geom_bar(stat="identity", position="dodge", alpha=0.5)+
    geom_text(aes(label=format(sum_freq, digits=2)),
              position = position_dodge(.9), 
              vjust = -0.5, 
              size = 3) + 
    scale_fill_manual(values=c("0"="blue", "1"="red"),
                      labels=c("DOWN","UP"))+
    labs(x="NR3C1 matches", y="% genes", fill="")+
    theme(
      plot.margin = unit(c(10, 6, 0, 10), "points")
    )
  gg_nr3c1counts

  relcounts <- motifdata_aggr %>% 
    group_by(label) %>% 
    dplyr::count(REL) %>% 
    mutate(counts_fac = case_when(REL>=1 ~ ">=1",
                                  REL==0 ~ "0"), # aggregate it towards to top end
           counts_fac = factor(as.character(counts_fac), levels=c("0",">=1")) ) %>%
    group_by(counts_fac) %>%
    mutate(freq = n / sum(n)) %>%
    group_by(label,counts_fac) %>%
    dplyr::summarize(sum_n=sum(n),
              sum_freq=sum(freq))

  gg_relcounts <- ggplot(data=relcounts , aes(x=counts_fac,y=sum_freq,fill=label)) + 
    geom_bar(stat="identity", position="dodge", alpha=0.5)+
    geom_text(aes(label=format(sum_freq, digits=2)),
              position = position_dodge(.9), 
              vjust = -0.5, 
              size = 3) + 
    scale_fill_manual(values=c("0"="blue", "1"="red"),
                      labels=c("DOWN","UP"))+
    labs(x="REL matches", y="% genes", fill="")+
    theme(
      plot.margin = unit(c(10, 6, 0, 10), "points")
    )
  gg_relcounts

  gg_rawcounts <-  ggpubr::ggarrange( gg_nr3c1counts, gg_relcounts, ncol=2, common.legend = TRUE, legend="bottom")
  saveRDS(gg_rawcounts, paste0(opt$outdir,"gg_rawcounts.rds"))

  #------------------------


  saveRDS(performance,
          file=my_rdsfile)
} else {
  performance <- readRDS(my_rdsfile)
}

plot(performance[[1]])

AUC_metrics <- rbind(AUC_metrics,
                     "prox" = c(
                       condition="dexlps",
                       motifdata="motifcounts_summitregion",
                       excludepromoters=FALSE,
                       onlymax="prox",
                       weight=FALSE,
                       sepPromEnh=FALSE,
                       performance[[2]]
                       )
                     )

net_model_coefs[["prox"]] <- performance[[3]]


## Raw counts
gg_firstgene <- ggplot() + 
  geom_bar(data=motifdata_aggr , aes(NR3C1,fill=label), position="dodge", alpha=0.5)+
  scale_fill_manual(values=c("0"="blue", "1"="red"),
                    labels=c("DOWN","UP"))+
  labs(x="NR3C1", y="counts", fill="")+
  theme(
    plot.margin = unit(c(10, 6, 0, 10), "points")
  )
gg_firstgene

#------------------------------------------------
## run example
#------------------------------------------------


# motifdata_aggr <- merge_motifdata_with_assignments(
#   motifcounts_summitregion,
#   assignment_summitregion_abc_dexlps,
#   contrast_DexLPSvLPS,
#   maxonly=FALSE, 
#   excludepromoters=FALSE, 
#   weightby=FALSE, 
#   sepPromEnh=FALSE)
# 
# motifdata_aggr_tranval_idx <- motifdata_aggr %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
# performance <- complete_GLM_analysis (motifdata_aggr, motifdata_aggr_tranval_idx, genenames=genenames)
# plot(performance[[1]])
# AUC_metrics <- rbind(AUC_metrics,
#                      "test" = c(condition="dexlps",
#                                 motifdata="motifcounts_peak",
#                                 onlymax=FALSE,
#                                 excludepromoters=FALSE,
#                                 weight=FALSE,
#                                 sepPromEnh=FALSE,
#                                 performance[[2]]))
# net_model_coefs[["test"]] <- performance[[3]]
# 
# gg_firstgene <- ggplot() + 
#   geom_bar(data=performance[[5]] , aes(NR3C1,fill=label), position="dodge", alpha=0.5)+
#   scale_fill_manual(values=c("0"="blue", "1"="red"),
#                     labels=c("DOWN","UP"))+
#   labs(x="NR3C1", y="counts", fill="")+
#   theme(
#     plot.margin = unit(c(10, 6, 0, 10), "points")
#   )
# gg_firstgene


#------------------------------------------------
## peakregion_counts - proxbased assignment
## peakregion_counts - abcbased assignment (both conditions)
## enhancerregion_counts - abcbased assignment (2 conditions)
#------------------------------------------------

# rewrite to assign values for both conditions and then use those to compute the difference as well

not_all_na <- function(x) any(!is.na(x))

for (motifdata in c("motifcounts_abcregion","motifcounts_summitregion")){
  for(excludepromoters in c(FALSE,"all","onlyNONself")){
    for (onlymax in c(TRUE,FALSE)){
      for (sepPromEnh in c(TRUE,FALSE)){
        for (weight in c(FALSE, "abcscore")){

          # set assignments fitting for the input data
          if(motifdata=="motifcounts_abcregion"){
            assignment_dexlps <- assignment_abcregion_dexlps
            assignment_lps <- assignment_abcregion_lps
            motifcounts_dexlps <- motifcounts_abcregion_dexlps
            motifcounts_lps <- motifcounts_abcregion_lps
          } else if (motifdata=="motifcounts_summitregion" ){
            # in this case the motifcounts for the 2 conditions are the same, but there assignments differ
            assignment_dexlps <- assignment_summits_abcregion_dexlps
            assignment_lps <- assignment_summits_abcregion_lps
            motifcounts_dexlps <- motifcounts_summitregion
            motifcounts_lps <- motifcounts_summitregion
          } else {break}

          #-----------------------DEXLPS------------------------------
          featurematrix_dexlps <- merge_motifdata_with_assignments(motifcounts_dexlps,
                                                                   assignment_dexlps,
                                                                   contrast_DexLPSvLPS,
                                                                   weightby=weight, 
                                                                   maxonly=onlymax, 
                                                                   excludepromoters=excludepromoters, 
                                                                   sepPromEnh=sepPromEnh)

          # remove columns that are NA after scaling 
          featurematrix_dexlps_scaled <- featurematrix_dexlps %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))

          #-----

          motifdata_aggr_tranval_idx <- featurematrix_dexlps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          modelname <- paste(motifdata,"condition_dexlps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here( paste0(opt$outdir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            performance <- complete_GLM_analysis (featurematrix_dexlps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)
            saveRDS(performance,file=my_rdsfile)
          } else {
            performance <- readRDS(my_rdsfile)
          }

          AUC_metrics <- rbind(AUC_metrics,
                               c(
                                 condition="dexlps",
                                 motifdata=motifdata,
                                 excludepromoters=excludepromoters,
                                 onlymax=onlymax,
                                 weight=weight,
                                 sepPromEnh=sepPromEnh,
                                 performance[[2]]
                                 )
                               )%>%
            magrittr::set_rownames(c(rownames(AUC_metrics),modelname))

          net_model_coefs[[modelname]] <- performance[[3]]

          #--------------------------LPS---------------------------
          featurematrix_lps <- merge_motifdata_with_assignments(motifcounts_lps,
                                                                assignment_lps,
                                                                contrast_DexLPSvLPS,
                                                                weightby=weight, 
                                                                maxonly=onlymax, 
                                                                excludepromoters=excludepromoters, 
                                                                sepPromEnh=sepPromEnh)
          featurematrix_lps_scaled <- featurematrix_lps %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          #-----
          motifdata_aggr_tranval_idx <- featurematrix_lps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          modelname <- paste(motifdata,"condition_lps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here(paste0(opt$outdir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            performance <- complete_GLM_analysis (featurematrix_lps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)
            saveRDS(performance,file=my_rdsfile)
          } else {
            performance <- readRDS(my_rdsfile)
          }

          AUC_metrics <- rbind(AUC_metrics,
                               c(
                                 condition="lps",
                                 motifdata=motifdata,
                                 excludepromoters=excludepromoters,
                                 onlymax=onlymax,
                                 weight=weight,
                                 sepPromEnh=sepPromEnh,
                                 performance[[2]]
                                 )
                               )%>%
            magrittr::set_rownames(c(rownames(AUC_metrics),modelname))

          net_model_coefs[[modelname]] <- performance[[3]]


          #-----------------------DIFFERENCE------------------------------

          # they contain the same motifs, but not the same genes. 
          # doublecheck that all columnnames are identical
          table(colnames(featurematrix_dexlps) == colnames(featurematrix_lps))

          #  motifcounts missing in one condition should be set to 0 

          merged_featurematrix <- merge(featurematrix_dexlps,
                                        featurematrix_lps, 
                                        by=c("anno","label","seqnames"),
                                        all=TRUE)

          # replace missing counts in one of the conditions with 0
          merged_featurematrix[is.na(merged_featurematrix)] <- 0

          # use dataframe suffix to grab respective columns
          featurematrix_diff <- 
            merged_featurematrix[grep(".x$",colnames(merged_featurematrix))] - 
            merged_featurematrix[grep(".y$",colnames(merged_featurematrix))] 
          # tidy up column names
          colnames(featurematrix_diff) <- gsub(".x$","", colnames(featurematrix_diff))

          # add first 3 columns back after computing the difference of the counts
          featurematrix_diff <- cbind(merged_featurematrix[1:3],featurematrix_diff)

          featurematrix_diff_scaled <- featurematrix_diff %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))

          # run GLM and look at performance  
          featurematrix_diff_tranval_idx <- featurematrix_diff_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))

          modelname <- paste(motifdata,"condition_DexLPS-LPS_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)
          my_rdsfile <- here(paste0(opt$outdir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            performance <- complete_GLM_analysis (featurematrix_diff_scaled, featurematrix_diff_tranval_idx, genenames=genenames)
            saveRDS(performance,file=my_rdsfile)
          } else {
            performance <- readRDS(my_rdsfile)
          }

          AUC_metrics <- rbind(AUC_metrics,
                               c(
                                 condition="DexLPS-LPS",
                                 motifdata=motifdata,
                                 excludepromoters=excludepromoters,
                                 onlymax=onlymax,
                                 weight=weight,
                                 sepPromEnh=sepPromEnh,
                                 performance[[2]]
                                 )
                               )%>%
            magrittr::set_rownames(c(rownames(AUC_metrics),modelname))

          net_model_coefs[[modelname]] <- performance[[3]]


        }
      }
    }
  }
}


#------------------------------------------------
## put model coefs into dataframes
#------------------------------------------------
motifnames <- colnames(motifcounts_summitregion)[3:ncol(motifcounts_summitregion)]

# make empty dataframe with all motifs
model_coefs_joint <- data.frame(featurename=motifnames)

# when we treat promoters and enhancers separately, the featurenames differ
model_coefs_sep <- data.frame(featurename= c(paste(motifnames,"genic",sep="_"),
                                             paste(motifnames,"promoter",sep="_"),
                                             paste(motifnames,"intergenic",sep="_"))
)

for (model in names(net_model_coefs)){
  if (grepl("sepPromEnh_FALSE",model)){
    model_coefs_joint <- merge(model_coefs_joint,
                               net_model_coefs[[model]],
                               by.x="featurename", by.y="names",
                               all=TRUE) %>% 
      dplyr::rename(!! model := "estimates")
  } else if (model=="prox"){
    model_coefs_joint <- merge(model_coefs_joint,
                               net_model_coefs[[model]],
                               by.x="featurename", by.y="names",
                               all=TRUE) %>% 
      dplyr::rename(!! model := "estimates")
  } else if (grepl("sepPromEnh_TRUE",model)){
    model_coefs_sep <- merge(model_coefs_sep,
                             net_model_coefs[[model]],
                             by.x="featurename", by.y="names",
                             all=TRUE) %>% 
      dplyr::rename(!! model := "estimates")
  } else { stop("The modelnames don't match the expected pattern.") }
}

#------------------------------------------------
## save data for plotting
#------------------------------------------------
saveRDS (model_coefs_joint, paste0(opt$outdir, "model_coefs_joint.rds"))
saveRDS (model_coefs_sep, paste0(opt$outdir, "model_coefs_sep.rds"))
saveRDS (AUC_metrics, paste0(opt$outdir, "AUC_metrics.rds"))

#------------------------------------------------
sessionInfo()

R ggplot2 dplyr org.Mm.eg.db optparse here ComplexHeatmap FiMO plyranges From line 1 of scripts/abc_predictions.r

suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c( "--ABC_DexLPS_all"),
              type="character",
              help="Path to file with all ABC predictions passing 0.02 (in DexLPS condition)"),
  make_option(c("--ABC_LPS_all"),
              type="character",
              help="Path to file with all ABC predictions passing 0.02 (in LPS condition)"),
  make_option(c("--contrast_DexVSDexLPS"),
              type="character",
              help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"),
  make_option(c("--chipseq_summits"),
              type="character",
              help="Path to summit file of IDR peaks"),
  make_option(c("--igv"),
              type="character",
              help="Path to IGV snapshot of genomic locus (created manually)")
)

opt <- parse_args(OptionParser(option_list=option_list))

options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(topGO, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggExtra, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(plyr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))

#set defaults for ggplot2 figures
theme_update(panel.background = element_rect(fill = "transparent", colour = NA),
             plot.background = element_rect(fill = "transparent", colour = NA),
             legend.background = element_rect(fill = "transparent", colour = NA),
             legend.key = element_rect(fill = "transparent", colour = NA),
             text=element_text(size=8, family = "ArialMT", colour="black"),
             title=element_text(size=10, family="ArialMT", colour="black"),
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             axis.text = element_text(size=8, family="ArialMT", colour="black"),
             axis.line = element_line(colour="black"),
             axis.ticks = element_line(colour="black"))

#----------------------------------------------------------------------
##--NOTE: example code for additional plots can be found in abc_visualizations.Rmd
#----------------------------------------------------------------------

#----------------------------------------------------------------------
##----------- load data
#----------------------------------------------------------------------

chipseq_summits <- rtracklayer::import.bed(opt$chipseq_summits)

contrast_DexLPSvLPS <- read.delim(opt$contrast_DexVSDexLPS)
# rename first column for merge later
colnames(contrast_DexLPSvLPS)[1] <- "EnsemblID"

ABC_DexLPS_all <- read.delim(opt$ABC_DexLPS_all)
ABC_LPS_all <- read.delim(opt$ABC_LPS_all)
# remove version number from ensembl IDs
ABC_DexLPS_all$TargetGene = gsub("\\.[0-9]*$","",ABC_DexLPS_all$TargetGene)
ABC_LPS_all$TargetGene = gsub("\\.[0-9]*$","",ABC_LPS_all$TargetGene)


#----------------------------------------------------------------------
##----------- data exploration
#----------------------------------------------------------------------

# What do the distributions of ABC scores look like

ggplot()+
  geom_density(data=ABC_DexLPS_all %>% filter(!class=="promoter"), aes(x=ABC.Score, colour="DexLPS"))+
  geom_density(data=ABC_LPS_all %>% filter(!class=="promoter"), aes(x=ABC.Score, colour="LPS"))+
  scale_colour_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC"))

# How do the ABC scores of promoters compare to those of enhancers?

ggplot()+
geom_density(data=ABC_DexLPS_all %>% filter(class=="promoter"), aes(x=ABC.Score, colour="promoter"))+
geom_density(data=ABC_DexLPS_all %>% filter(class=="genic"), aes(x=ABC.Score, colour="genic"))+
geom_density(data=ABC_DexLPS_all %>% filter(class=="intergenic"), aes(x=ABC.Score, colour="intergenic"))+
scale_colour_manual("",
                    breaks = c("promoter", "genic", "intergenic"),
                    values = c("red", "green", "blue"))

# What distance is used to define sth as promoter?

ggplot()+
  geom_density(data=ABC_DexLPS_all, aes(x=distance, colour=class))+
  scale_x_continuous(trans = "log10")+
  scale_colour_manual("",
                      breaks = c("promoter", "genic", "intergenic"),
                      values = c("red", "green", "blue"))

ggplot()+
  geom_density(data=ABC_DexLPS_all %>% filter(class=="promoter"), aes(x=distance, colour=isSelfPromoter))+
  scale_x_continuous(trans = "log10")+
  scale_colour_manual("isSelfPromoter",
                      breaks = c("True", "False"),
                      values = c("red", "blue"))+
  labs(title="All promoter regions, split by whether they're promoters for their target gene")+
  theme(
    plot.title = element_text(size=12)
  )
#----------------------------------------------------------------------
# ------ ------  Number of enhancers per gene
#----------------------------------------------------------------------
# Are most genes regulated by a single enhancer region or by multiple? How many does the ABC model assign per gene?

plot_enhancers_per_gene <- function(){
  DexLPS_TargetGeneCounts <- plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"), vars="TargetGene")
  LPS_TargetGeneCounts <- plyr::count(ABC_LPS_all %>% filter(!class=="promoter"), vars="TargetGene")

  gg <- ggplot()+
    geom_histogram(data=DexLPS_TargetGeneCounts, aes(x=freq, fill="DexLPS", colour="DexLPS"), binwidth = 1,  alpha=0.2)+
    geom_histogram(data=LPS_TargetGeneCounts, aes(x=freq, fill="LPS", colour="LPS"), binwidth = 1, alpha=0.2)+
    scale_fill_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC")) +
    scale_colour_manual("",
                        breaks = c("DexLPS", "LPS"),
                        values = c("#339966", "#0066CC")) +
    guides(color = "none")+
    theme(legend.position = c(0.8,0.8))+
    labs(title=" ",
         x="# of enhancers per gene",
         y="counts")

  return(gg)
}

gg_enh_per_gene <- plot_enhancers_per_gene()
gg_enh_per_gene

print("We find an average of")
plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"), vars="TargetGene") %>% pull(freq) %>% mean()
print("enhancers per gene for the DexLPS condition and")
plyr::count(ABC_LPS_all %>% filter(!class=="promoter"), vars="TargetGene") %>% pull(freq) %>% mean()
print("in LPS.")

#----------------------------------------------------------------------
# ------ ------  Number of genes per enhancer
#----------------------------------------------------------------------
# ABC allows for multiple assignments (an individual enhancer assigned to more than one promoter)  

plot_genes_per_enhancer <- function(){

  DexLPS_EnhancerCounts <- plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"),vars="name")
  LPS_EnhancerCounts <- plyr::count(ABC_LPS_all %>% filter(!class=="promoter"),vars="name")

  gg_full <- ggplot()+
    geom_histogram(data=DexLPS_EnhancerCounts, aes(x=freq, fill="DexLPS",colour="DexLPS"), binwidth = 1, alpha=0.2)+
    geom_histogram(data=LPS_EnhancerCounts, aes(x=freq, fill="LPS",colour="LPS"), binwidth = 1, alpha=0.2)+
    scale_fill_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC")) +
    scale_colour_manual("",
                        breaks = c("DexLPS", "LPS"),
                        values = c("#339966", "#0066CC")) +
    guides(color = FALSE)+
    labs(title="Number of genes an individual enhancer is assigned to",
         x="# of genes per enhancer",
         y="counts")
  plot(gg_full)

  gg_zoom <- ggplot()+
    geom_histogram(data=DexLPS_EnhancerCounts, aes(x=freq, fill="DexLPS",colour="DexLPS"), binwidth = 1, alpha=0.2)+
    geom_histogram(data=LPS_EnhancerCounts, aes(x=freq, fill="LPS",colour="LPS"), binwidth = 1, alpha=0.2)+
    scale_fill_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC")) +
    scale_colour_manual("",
                        breaks = c("DexLPS", "LPS"),
                        values = c("#339966", "#0066CC")) +
    xlim(c(0,8))+
    guides(color = FALSE)+
    theme(legend.position = c(0.8,0.8))+
    labs(title=" ",
         x="# of genes per enhancer",
         y="counts")
  plot(gg_zoom)

  print("We find an average of")
  DexLPS_EnhancerCounts %>% pull(freq) %>% mean() %>% print()
  print("genes per enhancer in the DexLPS condition and")
  LPS_EnhancerCounts %>% pull(freq) %>% mean() %>% print()
  print("in LPS.")

  return(gg_zoom)
}

gg_genes_per_enhancer_zoom <- plot_genes_per_enhancer()


#----------------------------------------------------------------------
# ------ Comparing ABC Scores (including promoter regions) between the conditions
#----------------------------------------------------------------------

# merge regions from both conditions but also keep those that are only in one
# 
# add info no whether the target genes are DE or now and what direction they're changed

LPS_GR<- plyranges::as_granges(ABC_LPS_all, seqnames=chr)
DexLPS_GR <- plyranges::as_granges(ABC_DexLPS_all, seqnames=chr)

#----------------------------------------------------------------------
merge_LPSandDexLPS_ABCdata <- function(ABC_LPS_all_GR, ABC_DexLPS_all_GR){
  ABC_LPS_all_unique <- unique(ABC_LPS_all_GR)
  ABC_DexLPS_all_unique <- unique(ABC_DexLPS_all_GR)

  # and then figure out which ones overlap with one another
  overlap_ABC_all_unique <- ChIPpeakAnno::findOverlapsOfPeaks (ABC_LPS_all_unique,
                                                               ABC_DexLPS_all_unique,
                                                               connectedPeaks = "keepAll",
                                                               ignore.strand = TRUE)
  # we coerce the info of overlapping regions into a dataframe
  overlap_ABC_all_unique_df <- as.data.frame(overlap_ABC_all_unique$overlappingPeaks)

  # and clean up column names
  colnames(overlap_ABC_all_unique_df)<- gsub("ABC_LPS_all_unique...ABC_DexLPS_all_unique.","", colnames(overlap_ABC_all_unique_df))
  colnames(overlap_ABC_all_unique_df)<- gsub(".1", "", colnames(overlap_ABC_all_unique_df))
  colnames(overlap_ABC_all_unique_df)<- paste( c(rep("LPS_",27),rep("DexLPS_",27),rep("",2)), colnames(overlap_ABC_all_unique_df), sep="")

  # we use this overlap to define a new variable that holds the info of which regions of the conditions form a pair
  overlap_ABC_all_unique_df$pairID <- paste("pair", seq(1, nrow(overlap_ABC_all_unique_df)), sep="" )

  #-----------------for DexLPS-------------------
  # merge the pairID onto the original dataframes
  ABC_DexLPS_all_wpairID <- merge(ABC_DexLPS_all,
                                  overlap_ABC_all_unique_df[,c("DexLPS_name","pairID")],
                                  by.x="name",
                                  by.y="DexLPS_name",
                                  all.x=TRUE)
  # for those that don't correspond to a pair, we assign condition and name as pairID
  ABC_DexLPS_all_wpairID <- ABC_DexLPS_all_wpairID %>% 
    mutate(pairID = case_when(is.na(pairID) ~ paste("DexLPS",name, sep="_"),
                              TRUE ~ pairID) )
  # combination of pairID and gene they're assigned to forms the assignment variable.
  # we will use this later to plot values from the 2 conditions, that have the same assignment
  ABC_DexLPS_all_wpairID$assignment <- paste(ABC_DexLPS_all_wpairID$pairID,
                                             ABC_DexLPS_all_wpairID$TargetGene,
                                             sep="_")

  #-----------------for LPS-------------------
  # merge the pairID onto the original dataframes
  ABC_LPS_all_wpairID <- merge(ABC_LPS_all,
                               overlap_ABC_all_unique_df[,c("LPS_name","pairID")],
                               by.x="name",
                               by.y="LPS_name",
                               all.x=TRUE)
  # for those that don't correspond to a pair, we assign condition and name as pairID
  ABC_LPS_all_wpairID <- ABC_LPS_all_wpairID %>% 
    mutate(pairID = case_when(is.na(pairID) ~ paste("LPS",name, sep="_"),
                              TRUE ~ pairID) )

  ABC_LPS_all_wpairID$assignment <- paste(ABC_LPS_all_wpairID$pairID,
                                          ABC_LPS_all_wpairID$TargetGene,
                                          sep="_")

  merged_conditions <- merge(ABC_DexLPS_all_wpairID,
                                 ABC_LPS_all_wpairID,
                                 by="assignment", 
                                 all=TRUE,
                                 suffixes = c(".DexLPS", ".LPS"))
  merged_conditions <- merged_conditions %>%
    mutate(TargetGene = case_when(TargetGene.DexLPS==TargetGene.LPS ~ TargetGene.DexLPS,
                                  is.na(TargetGene.DexLPS) ~ TargetGene.LPS,
                                  is.na(TargetGene.LPS) ~ TargetGene.DexLPS,
                                  TRUE ~ NA_character_)
    )

  return(merged_conditions)
}

merged_conditions_ABC <- merge_LPSandDexLPS_ABCdata(LPS_GR,DexLPS_GR)

#----------------------------------------------------------------------
# ------ merge on gene expression results
#----------------------------------------------------------------------

# merge the RNAseq results to use as colour later
merged_conditions_ABC <- merge(merged_conditions_ABC, contrast_DexLPSvLPS, 
                               by.x="TargetGene", by.y="EnsemblID")
merged_conditions_ABC <- merged_conditions_ABC %>%
  mutate(change=case_when(padj<0.05 & log2FoldChange>0.58 ~ "up",
                          padj<0.05 & log2FoldChange<(-0.58) ~ "down",
                          TRUE ~ "no change"))

# replace the na with 0
merged_conditions_ABCnomissing <- merged_conditions_ABC %>% 
  mutate(ABC.Score.DexLPS = case_when(is.na(ABC.Score.DexLPS) ~ 0,
                                      TRUE ~ ABC.Score.DexLPS) ) %>%
  mutate(ABC.Score.LPS = case_when(is.na(ABC.Score.LPS) ~ 0,
                                   TRUE ~ ABC.Score.LPS) )

cor_ABCscores <- cor(merged_conditions_ABCnomissing$ABC.Score.DexLPS,
                     merged_conditions_ABCnomissing$ABC.Score.LPS)

gg_merged_conditions_ABCnomissing <- ggplot(merged_conditions_ABCnomissing)+
  geom_point(aes(x=ABC.Score.DexLPS, y=ABC.Score.LPS, fill=log2FoldChange), alpha=0.2, size=1, stroke = 0, shape=21)+
  scale_fill_gradient(low="blue", high="red")+
  ylim(c(0,0.8))+
  xlim(c(0,0.8))+
  geom_text(x=0.2, y=0.75, label=paste("r = ",format(cor_ABCscores, digits = 2)))+
  labs(fill="log2FC", x="ABC score Dex+LPS", y="ABC score LPS")+
  theme(legend.position = "top")

gg_merged_conditions_ABCnomissing

#----------------------------------------------------------------------
# ------ plot foldchange in ABC score vs foldchange in expression
#----------------------------------------------------------------------

cor_ABCdiff_log2FC <- with(merged_conditions_ABCnomissing %>% filter(change!="no change"),
                           cor((ABC.Score.DexLPS - ABC.Score.LPS),log2FoldChange))

gg_ABCdiff_log2FC <- ggplot(merged_conditions_ABCnomissing %>% filter(change!="no change"), aes(x=(ABC.Score.DexLPS - ABC.Score.LPS), y=log2FoldChange)) +
  geom_hex(bins=70)+
  viridis::scale_fill_viridis()+
  geom_smooth(method = "lm", colour="black")+
  geom_text(x=0.6, y=5, label=paste("r = ",format(cor_ABCdiff_log2FC, digits = 2)))+
  labs(x="ABC score Dex+LPS - ABC score LPS", y="expression log2FC(Dex+LPS / LPS)")+
  theme(legend.position = "top")



#----------------------------------------------------------------------
# ------ ABC score by peak presence
#----------------------------------------------------------------------

plot_ABC_with_marginals <- function(whatchange){
  gg <- ggplot(data= merged_conditions_ABCnomissing %>% filter(change==whatchange) , 
               aes(x=ABC.Score.DexLPS, y=ABC.Score.LPS ) )+
    geom_point( aes( fill=whatchange) , size=1, alpha=1,stroke = 0, shape=21) +
    geom_abline(slope = 1)+
    ylim(c(0,0.8))+
    xlim(c(0,0.8))+
    labs(x="ABC score Dex+LPS", y="ABC score LPS")+
    scale_fill_manual(
      breaks=c("up","no change","down"),
      values=c("red","black","blue") )+
    theme(legend.position = "none")

  gg_marg <- ggExtra::ggMarginal(gg, type="histogram", size=2,
                                 xparams = list(bins=85), yparams = list(bins=85))

  return(gg_marg)
}

ggplot_ABC_with_marginals_up <- plot_ABC_with_marginals("up")
ggplot_ABC_with_marginals_up
ggplot_ABC_with_marginals_down <-plot_ABC_with_marginals("down")
ggplot_ABC_with_marginals_down

#----------------------------------------------------------------------
# ------ show marginals as bar plot
#----------------------------------------------------------------------
gg_marginals_dexlps <- 
  ggplot()+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="up"), aes(x=ABC.Score.DexLPS, fill="up"), alpha=0.2, bins=85)+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="down"), aes(x=ABC.Score.DexLPS, fill="down"), alpha=0.2, bins=85)+
  scale_fill_manual(
    breaks=c("up","down"),
    values=c("red","blue") )+
  xlim(c(NA,0.3))+
  labs(fill="gene expression change",x="ABC score DexLPS", y="ABC score LPS")+
  theme(legend.position = c(0.7,0.7))
gg_marginals_lps <- 
  ggplot()+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="up"), aes(x=ABC.Score.LPS, fill="up"), alpha=0.2, bins=85)+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="down"), aes(x=ABC.Score.LPS, fill="down"), alpha=0.2, bins=85)+
  scale_fill_manual(
    breaks=c("up","down"),
    values=c("red","blue") )+ 
  xlim(c(NA,0.3))+
  labs(fill="gene expression change",x="ABC score DexLPS", y="ABC score LPS")+
  theme(legend.position = c(0.7,0.7))

#----------------------------------------------------------------------
# ------ ABC score by peak presence
#----------------------------------------------------------------------
# Is there a difference in ABC score between the enhancers overlapping with a GR summit vs the ones that don't?


DexLPS_GR_summits_overlap <- IRanges::findOverlaps(DexLPS_GR,chipseq_summits) #(query,subject)
DexLPS_GR$haspeakID <- FALSE
DexLPS_GR$haspeakID[queryHits(DexLPS_GR_summits_overlap)] <- TRUE


plot_ABCscore_ofDEgenes_byGRoverlap <- function(){
  DE_ENSEMBL <- contrast_DexLPSvLPS %>%  filter(padj<0.05 & abs(log2FoldChange)>0.58 ) %>% pull(EnsemblID)

  DexLPS_enhancers_DEsubset <- 
    as.data.frame(DexLPS_GR) %>% 
    filter(TargetGene %in% DE_ENSEMBL) %>%
    filter(!class=="promoter")

  DexLPS_enhancers_DEsubset %>% filter(haspeakID==TRUE) %>% dplyr::pull(ABC.Score) %>% mean() %>% print()
  DexLPS_enhancers_DEsubset %>% filter(haspeakID==FALSE) %>% dplyr::pull(ABC.Score) %>% mean() %>% print()

  # show distribution for the two groups

  gg1 <- ggplot()+
    geom_density(data=DexLPS_enhancers_DEsubset %>% filter(haspeakID==TRUE), 
                 aes(x=ABC.Score, colour="haspeakID", fill="haspeakID"), alpha=0.3)+
    geom_density(data=DexLPS_enhancers_DEsubset %>% filter(haspeakID==FALSE), 
                 aes(x=ABC.Score, colour="nopeakID", fill="nopeakID"), alpha=0.3)+
    scale_x_continuous(trans = "log2")+
    scale_colour_manual("",
                        labels = c("has GR peak", "no GR peak"),
                        breaks = c("haspeakID", "nopeakID"),
                        values = c("darkmagenta", "cadetblue"))+
    scale_fill_manual("",
                      labels = c("has GR peak", "no GR peak"),
                      breaks = c("haspeakID", "nopeakID"),
                      values = c("darkmagenta", "cadetblue"))+
    guides(colour="none")+
    #xlim(c(-3,0))+
    labs(title=" ",
         x="ABC score Dex+LPS",
         y="density")+
    theme(
      axis.text = element_text(size=12, family="ArialMT", colour="black"),
      axis.title = element_text(size=12, family="ArialMT", colour="black"),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
      legend.position = c(0.8,0.8)
    )
  plot(gg1)

  gg2 <- DexLPS_enhancers_DEsubset %>%
    ggplot( aes(x=hic_contact_pl_scaled_adj, y=log2(activity_base))) +
    geom_hex()+
    facet_wrap(~haspeakID, labeller="label_both")+
    viridis::scale_fill_viridis()+
    labs(title = " ", x="Hi-C contact", y="log2(base activity)")

  plot(gg2)
  all_plots=list()
  all_plots[[1]] <- gg1
  all_plots[[2]] <- gg2
  return(all_plots)
}

gg_ABCscore_ofDEgenes_byGRoverlap <- plot_ABCscore_ofDEgenes_byGRoverlap()


#----------------------------------------------------------------------
# ------ load IGV plot
#----------------------------------------------------------------------
igv <- png::readPNG( opt$igv )

gg_igv <- ggplot() + 
  ggpubr::background_image(igv) +
  # so it doesn't get squished
  #coord_fixed()+
  # This ensures that the image leaves some space at the edges
  theme(plot.margin = margin(t=0, l=0, r=0, b=0, unit = "cm"),
        axis.line = element_blank())


#----------------------------------------------------------------------
# ------ put figure panel together
#----------------------------------------------------------------------

# save giant files separately
ggsave(here("./results/current/Figures/Figure_abcresults_3B.bmp"), gg_merged_conditions_ABCnomissing,
       width=75, height=100, units="mm",
       bg="white")
ggsave(here("./results/current/Figures/Figure_abcresults_3B.png"), gg_merged_conditions_ABCnomissing,
       width=75, height=100, units="mm",
       bg="white")

gg_placeholder <- ggplot() + 
  theme(plot.margin = margin(t=0, l=0, r=0, b=0, unit = "cm"),
        axis.line = element_blank())

# first row
gg_r1c1 <-  ggpubr::ggarrange(gg_enh_per_gene, gg_genes_per_enhancer_zoom,
                            labels = c("A", NA),
                            ncol = 1, nrow = 2, heights = c(1,1))

gg_r1c2 <-  ggpubr::ggarrange(gg_placeholder, ggplot_ABC_with_marginals_up, ggplot_ABC_with_marginals_down, 
                            labels = c("B", "C", NA),
                            ncol = 3, nrow=1, widths=c(1,1,1))

gg_r1 <- ggpubr::ggarrange(gg_r1c1, gg_r1c2, 
                           labels = c(NA, NA),
                           ncol = 2, nrow=1, widths=c(1,3))

# second row
gg_r2 <-  ggpubr::ggarrange(gg_ABCdiff_log2FC, gg_ABCscore_ofDEgenes_byGRoverlap[[1]], gg_ABCscore_ofDEgenes_byGRoverlap[[2]], 
                            labels = c("D","E", "F"),
                            ncol = 3, nrow=1, widths=c(1,1,1.5))


gg_tophalf <- ggpubr::ggarrange(gg_r1, gg_r2, 
                                nrow=2, heights =c(1,1))
gg_tophalf

full_panel <- ggpubr::ggarrange(gg_tophalf, gg_igv, 
                                labels=c(NA,"G"),
                                nrow=2, heights = c(1.8,1))

#full_panel
ggsave(here("./results/current/Figures/Figure_abcresults.png"), full_panel,
       width=300, height=300, units="mm",
       bg="white")
ggsave(here("./results/current/Figures/Figure_abcresults.pdf"), full_panel,
       width=300, height=300, units="mm",
       bg="white")

R ggplot2 dplyr org.Mm.eg.db TxDb.Mmusculus.UCSC.mm10.knownGene optparse plyr here ComplexHeatmap topGO From line 1 of scripts/abc_visualizations.r

suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c("--ctss_pool1"), 
              type="character",
              help="Path for ctss file from FANTOM5 of pool 1"),
  make_option(c("--ctss_pool2"), 
              type="character",
              help="Path for ctss file from FANTOM5 of pool 2"),
  make_option(c("--liftoverchain"), 
              type="character",
              help="Path to liftover chain file for mm9 to mm10"),
  make_option(c("--gencode_mm9_geneanno"), 
              type="character",
              help="Path to genomic reference file for mm9"),
  make_option(c("--gencode_mm10_geneanno"), 
              type="character",
              help="GENCODE genomic reference for assembly mm10, prefiltered for gene entries"),
  make_option(c("--outdir"), 
              type="character",
              help="Output directory")
)
opt <- parse_args(OptionParser(option_list=option_list))

suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm9, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ChIPseeker, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(CAGEr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(rtracklayer, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(GenomicRanges, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, warn.conflicts=F, quietly=T))

outdir <- here(opt$outdir)

# Workflow is based on the CAGEr vignette
# https://www.bioconductor.org/packages/release/bioc/vignettes/CAGEr/inst/doc/CAGEexp.html


#----------------------   Import CAGE samples from BMDMs
#------------------------------------------------------------------------------------

# get URL for public samples, download and read necessariy columns into ctss file (see snakemake workflow)
# We want to import bone-marrow derived macrophage samples through CAGEr.
# After looking at list of available datasets we can decide what samples best fit our needs.
# Let's see what samples are available through FANTOM5
#data(FANTOM5mouseSamples)
#head(FANTOM5mouseSamples)
# The FANTOM5 dataframe holds descriptions of the samples and the url where they can be retrieved.
# There's an easy way to import samples that match a certain term into a CAGEset object.
#mac_samples <- FANTOM5mouseSamples[grep("macrophage, bone marrow derived",
#                                        FANTOM5mouseSamples[,"description"]),]

print("NOTE: reference genome for the public CAGE files is mm9!")

ce <- CAGEr::CAGEexp( genomeName = "BSgenome.Mmusculus.UCSC.mm9",
                      inputFiles = c(opt$ctss_pool1 ,opt$ctss_pool2),
                      inputFilesType = "ctss",
                      sampleLabels   = c("pool1","pool2")
)
# To actually read in the data into the object we use getCTSS() function, that will add an experiment called tagCountMatrix to the CAGEexp object.
ce <- CAGEr::getCTSS(ce)
ce

#------------------------------------------------------------------------------------
#----------------------   QC
#------------------------------------------------------------------------------------

ncbim37_anno <- rtracklayer::import.gff(opt$gencode_mm9_geneanno)

ce <- annotateCTSS(ce, ncbim37_anno)
colData(ce)[,c("librarySizes", "promoter", "exon", "intron", "unknown")]
plotAnnot(ce, "counts")

corr.m <- plotCorrelation2( ce, samples = "all"
                            , tagCountThreshold = 1, applyThresholdBoth = FALSE
                            , method = "pearson")

#------------------------------------------------------------------------------------
#----------------------   Get read clusters
#------------------------------------------------------------------------------------

print("Merging samples")
#Now we can merge them
ce <- mergeSamples(ce, mergeIndex = c(1,1), 
                   mergedSampleLabels = c("BMDM"))

# redo annotation since this gets reset during merging
ce <- annotateCTSS(ce, ncbim37_anno)

print("The total library size is:")
print(librarySizes(ce))

# Check if data follows a power law distribution
plotReverseCumulatives(ce, fitInRange = c(5, 3000), onePlot = TRUE)

print("Normalizing reads")
# Since we don't really care about making comparisons between different population we could prob just skip the normalization
# The fit range is chosen from the plot. We take the alpha from the ref distribution and set T to a million to get the tag count per million (TPM)
ce <- normalizeTagCount(ce, method = "powerLaw", fitInRange = c(5, 3000), alpha = 1.15, T = 1*10^6)
#mac_CAGEset@tagCountMatrix

print("Cluster the tags")
# After normalization we can cluster the tags.
# Clustering, only seems to work with the CAGEset object (due to some problems with the IRanges column)
# From the CAGEr vignette:
# "Transcription start sites are found in the promoter region of a gene and reflect the transcriptional activity of that promoter (Figure 5). TSSs in the close proximity of each other give rise to a functionally equivalent set of transcripts and are likely regulated by the same promoter elements. Thus, TSSs can be spatially clustered into larger transcriptional units, called tag clusters (TCs) that correspond to individual promoters. CAGEr supports three methods for spatial clustering of TSSs along the genome, two ab initio methods driven by the data itself, as well as assigning TSSs to predefined genomic regions:"

ce <- clusterCTSS(ce,
           threshold=1,
           thresholdIsTpm = TRUE,
           nrPassThreshold = 1,
           method="distclu",
           maxDist=20,
           removeSingletons = TRUE,
           keepSingletonsAbove = 3)

# Let's have a look what the result looks like
head(tagClustersGR(ce, sample = "BMDM"))

# calculate cumulative distribution for every tag cluster in each of the samples
ce <- cumulativeCTSSdistribution(ce, clusters = "tagClusters", useMulticore = T)
# determine the positions of selected quantiles
ce <- quantilePositions(ce, clusters = "tagClusters", qLow = 0.1, qUp = 0.9)

# How many tagclusters do we have in total?
length(tagClustersGR(ce, sample = "BMDM"))

# histogram of interquantile width
plotInterquantileWidth(ce, clusters = "tagClusters", tpmThreshold = 3, qLow = 0.1, qUp = 0.9)

print("Retrieving clusters as GenomicRanges")
clusters_gr <- tagClustersGR(ce, sample="BMDM")

#------------------------------------------------------------------------------------
#----------------------   Liftover coordinates to mm10
#------------------------------------------------------------------------------------

print("Liftover to mm10")

# * Now we can lift over the intervals to mm10
# * Annotate them with peakanno
# * pick the most highly expressed one for each gene

liftover <- function(peaks_gr_mm9){ #input is a GenomicRanges object in mm9 coordinates
  #lift peak locations from mm9 to mm10
  chain <- rtracklayer::import.chain(opt$liftoverchain)
  on.exit( close( file(opt$liftoverchain)) )

  peaks_gr_mm10 <- rtracklayer::liftOver(peaks_gr_mm9, chain)
  peaks_gr_mm10 <- GenomicRanges::GRanges(unlist(peaks_gr_mm10))

  return(peaks_gr_mm10)
}

mac_cage_mm10 <- liftover( clusters_gr )
ggplot(as.data.frame(mac_cage_mm10), aes(x=width)) +
  geom_histogram(bins = 100)

# Liftover coordinates of dominant_ctss
dominant_ctss <- liftover( 
  GRanges(
    seqnames = seqnames(clusters_gr), 
    ranges = IRanges(start = clusters_gr$dominant_ctss,
                     end = clusters_gr$dominant_ctss),
    score=clusters_gr$score)
)

#------------------------------------------------------------------------------------
#-----------------   Annotate TSS clusters to reference gene coordinates
#------------------------------------------------------------------------------------
print("Annotate TSS coordinates")
# Use coordinates of the dominant ctss downstream

mac_cage_anno <- ChIPseeker::annotatePeak(dominant_ctss, 
                                          tssRegion=c(-1000, 1000), #more stringent than default
                                          level = "gene",
                                          TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                                          annoDb = "org.Mm.eg.db")

# For those that are reasonably close to a TSS,
# check for each gene, which position has the highest score.

mac_cage_maxscore <- as.data.frame(mac_cage_anno) %>% 
  filter(abs(distanceToTSS)<=30000) %>%
  mutate(SYMBOL=as.factor(SYMBOL)) %>%
  filter(SYMBOL!="") %>%
  group_by(SYMBOL)%>%
  filter(score == max(score))%>%
  filter(distanceToTSS == min(distanceToTSS )) # for tied score, use shorter distance

nrow(mac_cage_maxscore)

ggplot(mac_cage_maxscore, aes(x=distanceToTSS)) +
  geom_histogram(bins = 100)

#------------------------------------------------------------------------------------
#----------   retrieve gene coordinates and promoterregion from reference
#------------------------------------------------------------------------------------

gencode_mm10_geneanno <- rtracklayer::import.gff(opt$gencode_mm10_geneanno)
genecoords <- as.data.frame(gencode_mm10_geneanno) %>%
  dplyr::select("seqnames","start","end","strand","gene_id") %>%
  mutate(score=0) %>%
  dplyr::mutate(gene_id=gsub("\\.[0-9]*$","",gene_id)) %>%
  dplyr::filter(!"gene_id"=="")%>%
  dplyr::select("seqnames","start","end","gene_id","score","strand")


gencode_mm10_promoterregions <- promoters(gencode_mm10_geneanno)
gencode_mm10_promoterregions <- as.data.frame(gencode_mm10_promoterregions) %>%
  dplyr::select("seqnames","start","end","strand","gene_id") %>%
  mutate(score=0) %>%
  dplyr::mutate(gene_id=gsub("\\.[0-9]*$","",gene_id)) %>%
  dplyr::filter(!"gene_id"=="")%>%
  dplyr::select("seqnames","start","end","gene_id","score","strand")

#------------------------------------------------------------------------------------
#-----------------   export files
#------------------------------------------------------------------------------------

write.table(genecoords, 
            file = paste0(outdir,"reference_genecoords.bed"),
            sep="\t", 
            col.names = FALSE,
            quote=FALSE,
            row.names = FALSE)

write.table(gencode_mm10_promoterregions, 
            file = paste0(outdir,"reference_promoterregions.bed"),
            sep="\t", 
            col.names = FALSE,
            quote=FALSE,
            row.names = FALSE)


rtracklayer::export.bed(as.data.frame(mac_cage_mm10),
                        paste0(outdir, "mac_cage_tssclusterregions.bed"),
                        format="bed")

rtracklayer::export.bed(as.data.frame(dominant_ctss),
                        paste0(outdir, "mac_cage_dominant_ctss.bed"),
                        format="bed")

rtracklayer::export.bed(mac_cage_maxscore,
                        paste0(outdir,"mac_cage_maxscore.bed"),
                        format="bed")

R Snakemake ggplot2 dplyr org.Mm.eg.db TxDb.Mmusculus.UCSC.mm10.knownGene BSgenome.Mmusculus.UCSC.mm9 optparse GenomicRanges here ChIPseeker CAGEr From line 1 of scripts/get_TSS_from_cage.r

suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c( "--contrast_DexLPSvLPS"),
              type="character",
              help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"),
  make_option(c("--assignment_summit_prox"),
              type="character",
              help="Path to rds file of proximity based assignment of peak summits to genes"),
  make_option(c("--assignment_summits_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in DexLPS condition)"),
  make_option(c("--assignment_summits_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in LPS condition)"),
  make_option(c("--assignment_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in DexLPS condition)"),
  make_option(c( "--assignment_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in LPS condition)"),
  make_option(c("--motifcounts_summitregion"),
              type="character",
              help="Path to rds file of fimo motifcounts within summitregions"),
  make_option(c("--motifcounts_abcregion_dexlps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in DexLPS condition)"),
  make_option(c("--motifcounts_abcregion_lps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in LPS condition)"),
  make_option(c("--model_coefs_joint"),
              type="character",
              help="Path to rds file with model coefficients of joint models"),
  make_option(c("--model_coefs_sep"),
              type="character",
              help="Path to rds file with model coefficients of models tha include enhancers and promoters separately"),
  make_option(c( "--featuredir"),
              type="character",
              help="Path to directory with unscales featurematrizes"),
  make_option(c( "--outfile"),
              type="character",
              help="Path to output file with metrics")
)

opt <- parse_args(OptionParser(option_list=option_list))

dir.create(opt$featuredir)

#change default for stringAsFactors
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(plyranges, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))

#set defaults for ggplot2 figures
theme_update(panel.background = element_rect(fill = "transparent", colour = NA),
             plot.background = element_rect(fill = "transparent", colour = NA),
             legend.background = element_rect(fill = "transparent", colour = NA),
             legend.key = element_rect(fill = "transparent", colour = NA),
             text=element_text(size=6, family = "ArialMT", colour="black"),
             title=element_text(size=8, family="ArialMT", colour="black"),
             panel.grid.major = element_line(colour="grey", size=0.2),
             panel.grid.minor = element_blank(),
             axis.text = element_text(size=6, family="ArialMT", colour="black"),
             axis.line = element_line(colour="black"),
             axis.ticks = element_line(colour="black"),
             legend.key.size = unit(6, 'points'), #change legend key size
             legend.key.height = unit(6, 'points'), #change legend key height
             legend.key.width = unit(6, 'points'), #change legend key width
             legend.text = element_text(size=6, family="ArialMT", colour="black"))

set.seed(12345)

#-------------------------------
## read in  data
#-------------------------------

contrast_DexLPSvLPS <- read.delim(opt$contrast_DexLPSvLPS)

for (optname in names(opt)[2:11]){ #except for the featuredir
  print(paste0("Loading ", optname))
  assign(optname, readRDS( opt[[optname]] ))
}

#--------------------------------------------
## ---- function definitions
#--------------------------------------------

coerce_coef2df <- function(model_coef){
  model_coef <- as.matrix(model_coef)
  model_coef <- as.data.frame(model_coef)
  model_coef$names <- rownames(model_coef)
  colnames(model_coef) <- c("estimates", "names")
  return(model_coef)
}

#------------function will------------:
# * merge the motifdata with gene assignments and 
# * then merge the gene expression changes,
# * filter for DE genes and aggregate per gene
#--------------------------------------
merge_motifdata_with_assignments <- function( motifcounts, assignments, contrast, 
                                              maxonly=FALSE, excludepromoters=FALSE, weightby=FALSE, sepPromEnh=FALSE){

  # check if we should use all abc assignments, or only the max one of each peakID
  if(maxonly==TRUE){
    assignments <- assignments %>% group_by(name) %>% filter(abcscore==max(abcscore)) %>% distinct()
  } else{
    assignments <- assignments
  }

  if(excludepromoters=="all"){
    assignments <- assignments %>% filter(!class=="promoter")
  } else if (excludepromoters=="onlyNONself"){
    assignments <- assignments %>% filter(!c(class=="promoter" & isSelfPromoter=="False"))
  } else{
    assignments <- assignments
  }

  motifdf <- merge(motifcounts, assignments, 
                   by.x="name", by.y="name")

  motifdf <- motifdf %>% relocate(c(anno))

  # merge the expression change
  # use mgi_symbol for prox based, otherwise ensemblID
  # We DONT set all.x=TRUE because we don't care about predicting gene that aren't even expressed or that we don't have a clear label for
  if(maxonly=="prox"){
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="mgi_symbol")
  } else {
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="Row.names")
  }

  #recode the logFC and padj into a label (optionally through command line arguments)
  motifdf <- motifdf %>% 
    mutate(label=case_when(log2FoldChange>0.58 & padj < 0.05 ~ "up",
                           log2FoldChange<(-0.58) & padj < 0.05 ~ "down",
                           TRUE ~ "no_change")) %>%
    filter(label!="no_change") %>%
    mutate(label=factor(label,
                        levels=c("down","up"),
                        labels=c(0,1))) %>%
    relocate(label)

  # chr 2, 3 and 4 (20%) were used as the tuning set for hyperparameter tuning. 
  # Regions from chromosomes 1, 8 and 9 (20%) were used as the test set for performance evaluation 
  # The remaining regions were used for model training.

  #------aggregate over gene SYMBOL
  if ("abcscore" %in% colnames(motifdf)) { # loop for ABC based assignments

    if (weightby=="abcscore"){
      unselect_col <- "abcnumerator"
    } else if(weightby=="abcnumerator") {
      unselect_col <- "abcscore"
    } else{
      unselect_col <- c("abcscore","abcnumerator")
    }

    if (sepPromEnh==TRUE){
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(.,across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno,class) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()

      # From here we need to cast the motifcounts for the promoterregions, so that in the end we have one row per gene (instead of 1-2)
      motifdf_aggr <- motifdf_aggr %>% tidyr::pivot_wider(id_cols=c(label,seqnames,anno), 
                                                          names_from=class, 
                                                          values_from = !c(label,seqnames,anno, class),
                                                          values_fill = 0)
    } else {
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(., across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()
    }


  } else { # loop for prox based assignments
    motifdf_aggr <-
      motifdf %>% 
      dplyr::select(!c("name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype")) %>% 
      group_by(label,seqnames,anno) %>%
      dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
      ungroup()
  }

  return(motifdf_aggr)

}

get_feature_and_label_metrics <- function(motifdf, trainvalidx, genenames){

  targets_train <- motifdf[ trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()
  targets_test <- motifdf[ -trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()

  metrics = data.frame(
    n_input_features = ncol( motifdf[ , -c(1,2,3)]),# first three columns are labels, chromosome and gene annotation
    n_neg_inst_train = table(targets_train)[['0']],
    n_pos_inst_train = table(targets_train)[['1']],
    n_neg_inst_test = table(targets_test)[['0']],
    n_pos_inst_test = table(targets_test)[['1']]
  )

  return(metrics)
}

#------------------------------------------------
## initialize object to gather all metrics
#------------------------------------------------
all_metrics=data.frame(
  n_input_features=numeric(),
  n_neg_inst_train=numeric(),
  n_pos_inst_train=numeric(),
  n_neg_inst_test=numeric(),
  n_pos_inst_test=numeric()
)
#------------------------------------------------
## run proximity based
#------------------------------------------------

#---------------------- 
# This is independent of the ABC results (no need to loop through different assignment variations)

print("Getting metrics of prox-based model")
my_rdsfile <- paste0(opt$featuredir,"prox.rds")
if(!file.exists(my_rdsfile)){

  motifdata_aggr_prox <- merge_motifdata_with_assignments(motifcounts_summitregion,
                                                     assignment_summit_prox,
                                                     contrast_DexLPSvLPS,
                                                     maxonly="prox",
                                                     excludepromoters=FALSE,
                                                     weightby = FALSE,
                                                     sepPromEnh = FALSE)

  saveRDS(motifdata_aggr_prox,file=my_rdsfile)
} else{
  motifdata_aggr_prox <- readRDS(my_rdsfile)
}

motifdata_aggr_prox_scaled <- motifdata_aggr_prox %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector)))
motifdata_aggr_prox_tranval_idx <- motifdata_aggr_prox_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
metrics_prox <- get_feature_and_label_metrics (motifdata_aggr_prox_scaled, motifdata_aggr_prox_tranval_idx, genenames=genenames)

all_metrics <- rbind(all_metrics,
                     "prox"=metrics_prox)


#---------------------- 

not_all_na <- function(x) any(!is.na(x))

for (motifdata in c("motifcounts_abcregion","motifcounts_summitregion")){
  for(excludepromoters in c(FALSE,"all","onlyNONself")){
    for (onlymax in c(TRUE,FALSE)){
      for (sepPromEnh in c(TRUE,FALSE)){
        for (weight in c(FALSE, "abcscore")){

          # set assignments fitting for the input data
          if(motifdata=="motifcounts_abcregion"){
            assignment_dexlps <- assignment_abcregion_dexlps
            assignment_lps <- assignment_abcregion_lps
            motifcounts_dexlps <- motifcounts_abcregion_dexlps
            motifcounts_lps <- motifcounts_abcregion_lps
          } else if (motifdata=="motifcounts_summitregion" ){
            # in this case the motifcounts for the 2 conditions are the same, but there assignments differ
            assignment_dexlps <- assignment_summits_abcregion_dexlps
            assignment_lps <- assignment_summits_abcregion_lps
            motifcounts_dexlps <- motifcounts_summitregion
            motifcounts_lps <- motifcounts_summitregion
          } else {break}

          #-----------------------DEXLPS------------------------------
          modelname <- paste(motifdata,"condition_dexlps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here( paste0(opt$featuredir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            featurematrix_dexlps <- merge_motifdata_with_assignments(motifcounts_dexlps,
                                                                   assignment_dexlps,
                                                                   contrast_DexLPSvLPS,
                                                                   weightby=weight, 
                                                                   maxonly=onlymax, 
                                                                   excludepromoters=excludepromoters, 
                                                                   sepPromEnh=sepPromEnh)

            saveRDS(featurematrix_dexlps, my_rdsfile)
          } else{
            featurematrix_dexlps <- readRDS(my_rdsfile)
          }
          # remove columns that are NA after scaling
          featurematrix_dexlps_scaled <- featurematrix_dexlps %>%
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          motifdata_aggr_tranval_idx <- featurematrix_dexlps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          new_metric <- get_feature_and_label_metrics (featurematrix_dexlps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)

          all_metrics <- rbind(all_metrics,
                               new_metric)%>%
            magrittr::set_rownames(c(rownames(all_metrics),modelname))

          #--------------------------LPS---------------------------
          modelname <- paste(motifdata,"condition_lps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here(paste0(opt$featuredir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            featurematrix_lps <- merge_motifdata_with_assignments(motifcounts_lps,
                                                                assignment_lps,
                                                                contrast_DexLPSvLPS,
                                                                weightby=weight, 
                                                                maxonly=onlymax, 
                                                                excludepromoters=excludepromoters, 
                                                                sepPromEnh=sepPromEnh)

            saveRDS(featurematrix_lps,file=my_rdsfile)
          } else{
            featurematrix_lps <- readRDS(my_rdsfile)
          }
          featurematrix_lps_scaled <- featurematrix_lps %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          motifdata_aggr_tranval_idx <- featurematrix_lps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          new_metric <- get_feature_and_label_metrics (featurematrix_lps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)

          all_metrics <- rbind(all_metrics,
                               new_metric)%>%
            magrittr::set_rownames(c(rownames(all_metrics),modelname))

          #-----------------------DIFFERENCE------------------------------

          modelname <- paste(motifdata,"condition_DexLPS-LPS_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)
          my_rdsfile <- here(paste0(opt$featuredir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            # they contain the same motifs, but not the same genes.
            # doublecheck that all columnnames are identical
            table(colnames(featurematrix_dexlps) == colnames(featurematrix_lps))

            #  motifcounts missing in one condition should be set to 0 
            merged_featurematrix <- merge(featurematrix_dexlps,
                                        featurematrix_lps, 
                                        by=c("anno","label","seqnames"),
                                        all=TRUE)
            # replace missing counts in one of the conditions with 0
            merged_featurematrix[is.na(merged_featurematrix)] <- 0

            # use dataframe suffix to grab respective columns
            featurematrix_diff <- 
              merged_featurematrix[grep(".x$",colnames(merged_featurematrix))] - 
              merged_featurematrix[grep(".y$",colnames(merged_featurematrix))] 
            # tidy up column names
            colnames(featurematrix_diff) <- gsub(".x$","", colnames(featurematrix_diff))

            # add first 3 columns back after computing the difference of the counts
            featurematrix_diff <- cbind(merged_featurematrix[1:3],featurematrix_diff)

            saveRDS(featurematrix_diff,file=my_rdsfile)
          } else{
            featurematrix_diff <- readRDS(my_rdsfile)
          }
          featurematrix_diff_scaled <- featurematrix_diff %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          # run GLM and look at performance  
          featurematrix_diff_tranval_idx <- featurematrix_diff_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          new_metric <- get_feature_and_label_metrics (featurematrix_diff_scaled, featurematrix_diff_tranval_idx, genenames=genenames)

          all_metrics <- rbind(all_metrics,
                               new_metric)%>%
            magrittr::set_rownames(c(rownames(all_metrics),modelname))


        }
      }
    }
  }
}

all_metrics <- all_metrics %>%
  mutate(ratio_pos_inst_train = n_pos_inst_train/(n_pos_inst_train+n_neg_inst_train),
         ratio_pos_inst_test = n_pos_inst_test/(n_pos_inst_test+n_neg_inst_test)
         )

# let'S add the number of non-zero coefficients to this
model_coefs_joint <- readRDS( opt$model_coefs_joint )
model_coefs_sep <- readRDS( opt$model_coefs_sep )

nonzero_columentries <- rbind ( colSums(model_coefs_joint != 0, na.rm=TRUE) %>% as.data.frame(),
                            colSums(model_coefs_sep != 0, na.rm=TRUE) %>% as.data.frame() )
colnames(nonzero_columentries) <- "n_sel_features_inclintercept"

all_metrics <- merge(all_metrics, nonzero_columentries, by="row.names") %>% 
  relocate(n_sel_features_inclintercept, .after = n_input_features )


# parse feature engineering choices from modelname and add them as variables
all_metrics <- all_metrics %>%
  mutate(condition = case_when(grepl("condition_lps", Row.names) ~ "LPS",
                              grepl("condition_DexLPS-LPS", Row.names) ~ "Dex+LPS - LPS",
                              grepl("condition_dexlps", Row.names) ~ "Dex+LPS"),
         input_region = case_when(grepl("motifcounts_summitregion", Row.names) ~ "GR summitregions",
                                  grepl("motifcounts_abcregion", Row.names) ~ "active regions"),
         excludepromoters = case_when(grepl("exclprom_all", Row.names) ~ "all",
                                      grepl("exclprom_onlyNONself", Row.names) ~ "nonself",
                                      grepl("exclprom_FALSE", Row.names) ~ "none"),
         mapping = case_when(grepl("onlymax_FALSE", Row.names) ~ "1-to-many (ABC-based)",
                             grepl("onlymax_TRUE", Row.names) ~ "1-to-1 (ABC-based)"),
         weight = case_when(grepl("weight_abcscore", Row.names) ~ "abcscore",
                             grepl("weight_FALSE", Row.names) ~ "none"),
         prom_and_enh_features = case_when(grepl("sepPromEnh_TRUE", Row.names) ~ "separate",
                                      grepl("sepPromEnh_FALSE", Row.names) ~ "aggregate")

         )
# manually update choices for the reference model
all_metrics <- all_metrics %>%
  rows_update( tibble(Row.names = "prox", 
                      condition="Dex+LPS",
                      input_region="GR summitregions",
                      mapping="1-to-1 (proximity-based)",
                      weight="none"), 
               by = "Row.names")

summary(all_metrics)

summary(all_metrics$ratio_pos_inst_train-all_metrics$ratio_pos_inst_test)

write.table(all_metrics,
            file=opt$outfile,
            quote=FALSE, sep="\t", row.names = FALSE, col.names = TRUE )

R ggplot2 dplyr org.Mm.eg.db optparse here ComplexHeatmap FiMO plyranges From line 3 of scripts/quantify_features_and_label_distribution.r

Workflow dervied form https://github.com/snakemake-workflows/rna-seq-star-deseq2 for multi condition Deseq2 and enrichment analyis (v0.1.1)

knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, dev = "png",
  fig.width = 12, fig.height = 12
)
library(purrr)
library(clusterProfiler)
library(magrittr)
library(ggplot2)
library(org.Mm.eg.db)
library(svglite)
if (exists("snakemake")) {
  gsea_list <- snakemake@input[["gsea_result"]]
  plot_path <- snakemake@params[["plot_path"]]
  c_groups <- snakemake@params[["contrast_groups"]]
  write(unlist(c_groups), file = stderr())
  names(gsea_list) <- names(x = c_groups)
  species_name <- snakemake@config[["organism"]]
  gene_name_type <- snakemake@config[["gene_name_type"]]
  gsea_conf <- snakemake@config[["gsea"]]
  save_plots <- TRUE
} else {
  conf <- yaml::read_yaml("../../../configs/VascAge_Apelin_config.yaml")
  BASE_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR)
  cond_id <- names(conf$diffexp$contrasts)[1]
  gsea_list <- as.list(file.path(BASE_DIR, glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.gseares.RDS")))
  c_groups <- names(conf$diffexp$contrasts[[cond_id]])
  names(gsea_list) <- c_groups
  #  gsea_list <- gsea_list[1:3]
  plot_path <- "/home/heyer/tmp/"
  species_name <- "Mus musculus"
  gene_name_type <- "ENSEMBL"
  save_plots <- FALSE
  gsea_conf <- conf$gsea
}
limit <-  10

R Markdown Snakemake ggplot2 org.Mm.eg.db magrittr purrr clusterProfiler svglite From line 11 of RMD_scripts/gsea_report.Rmd

library(tibble)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
if (packageVersion("mitch") < "1.1.8") {
  devtools::install_github("markziemann/mitch")
}
# library(org.Mm.eg.db)
library(mitch)
library(tidyverse)
library(ensembldb)
library(AnnotationHub)
## Load the annotation resource.
ah <- AnnotationHub()
Gtf <- ah["AH28753"]
## Create a EnsDb database file from this.
DbFile <- ensDbFromAH(Gtf)
## We can either generate a database package, or directly load the data
edb <- EnsDb(DbFile)
library(msigdbr)
if (exists("snakemake")) {
  # Input Files
  diffexp_tables_paths <- snakemake@input[["tables"]]
  contrast_list <- snakemake@params[["contrasts"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  # Output Files
  mitch_table <- snakemake@output[["mitch_table"]]
  mitch_rds <- snakemake@output[["mitch_rds"]]
  report_path <- snakemake@output[["mitch_report"]]
  # Parameters
  threads <- snakemake@threads
  enrichment_term_type <- "SYMBOL"
} else {
  BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022/"
  contrast_list <- c(
    "basal_cre_pos_vs_basal_cre_neg", "tumor_cre_pos_vs_tumor_cre_neg", "tumor_cre_pos_vs_basal_cre_pos",
    "tumor_cre_neg_vs_basal_cre_neg"
  )
  diffexp_tables_paths <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/{contrast_list}.diffexp.tsv")))


  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  threads <- 1
  report_path <- "big_yeeter.html"
  enrichment_term_type <- "SYMBOL"
}

## Read data
diff_exp_tables <- purrr::map(diffexp_tables_paths, readr::read_tsv, col_names = c(
  "gene_id", "baseMean", "logFoldChange",
  "lfcSE", "stat", "pvalue", "padj"
), skip = 1)
names(diff_exp_tables) <- contrast_list
diff_exp_frames <- purrr::map(diff_exp_tables, function(x) {
  gene_ids <- x$gene_id
  diff_exp_fr <- as.data.frame(x[, -1])
  rownames(diff_exp_fr) <- gene_ids
  diff_exp_fr
})
# names(diff_exp_frames) <- contrast_list Read FPKM file
fpkm <- readr::read_tsv(fpkm_path)

match_table <- fpkm %>%
  dplyr::select(c("gene", "gname"))
match_vec <- setNames(object = match_table$gname, nm = match_table$gene)


### Setupt mitch input data
prep_mitch_input <- function(deseq_list, e_term = "ENSEMBL") {
  # mitch_input_df <- mitch::mitch_import(diff_exp_frames, 'DESeq2', geneTable = as.data.frame(fpkm[,c('gene',
  # 'gname')]))
  mitch_input_df <- mitch::mitch_import(deseq_list, "DESeq2")
  gname_tb <- tibble::tibble(Transcript_id = rownames(mitch_input_df))
  if (e_term == "ENSEMBL") {
    gname_tb["gene_id"] <- stringr::str_extract(gname_tb$Transcript_id, "ENSMUSG[0-9]*")
  } else if (e_term == "SYMBOL") {
    gname_tb["gene_id"] <- match_vec[gname_tb$Transcript_id]
  }
  rownames(mitch_input_df) <- gname_tb$gene_id
  colnames(mitch_input_df) <- stringr::str_remove_all(colnames(mitch_input_df), pattern = "-")
  ifelse(anyDuplicated(gname_tb$gene_id), warning("Duplicated genes in mitch input dataframe."), print("no duplicates"))

  mitch_input_df
}
#' Uniquify a vector
#'
#' @param x Input factor to check
#' @return corrected facotr value
#' @examples
#' NULL
uniquify <- function(x) {
  while (anyDuplicated(x)) {
    x[duplicated(x)] <- paste0(x[duplicated(x)], "_1")
  }
  x
}
mitch_input_df <- prep_mitch_input(diff_exp_frames)
# mm_reactome <- buildReactome(output_type = enrichment_term_type) ensembl_reactome <- buildReactome(output_type =
# 'ENSEMBL')


if (enrichment_term_type == "SYMBOL") {
  # Get GENE ids
  new_ids <- ensembldb::select(edb, keys = rownames(mitch_input_df), keytype = "GENEID", columns = c("SYMBOL", "GENEID"))
  update_vector <- setNames(new_ids$SYMBOL, nm = new_ids$GENEID)
  # Stop if any duplicated genes in reactome gene set
  names(match_vec) <- stringr::str_extract(names(match_vec), "ENSMUSG[0-9]*")

  geneIDs <- match_vec[rownames(mitch_input_df)]
  geneIDs[names(update_vector)] <- update_vector


  col_names <- c("gs_name", "gene_symbol")
  mm_msigdbr <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>%
    dplyr::select(col_names)
  msigdb_gene_list <- split(x = mm_msigdbr$gene_symbol, f = mm_msigdbr$gs_name)
  ensembl_in_msigdb <- new_ids %>%
    dplyr::filter(SYMBOL %in% mm_msigdbr$gene_symbol) %>%
    pull(GENEID)

  ## Cleanup duplicates
  mitch_input_df <- mitch_input_df[(rownames(mitch_input_df) %in% ensembl_in_msigdb) | !(duplicated(geneIDs[rownames(mitch_input_df)]) |
    duplicated(geneIDs[rownames(mitch_input_df)], fromLast = T)), ]

  geneIDs <- match_vec[rownames(mitch_input_df)]
  # geneIDs[names(update_vector)] <- update_vector
  dup_mitch <- mitch_input_df[] %>%
    as_tibble(rownames = "ENSEMBL")
  dup_mitch["SYMBOL"] <- geneIDs[dup_mitch$ENSEMBL]
  # Remove samples which(dup_mitch$ENSEMBL %in% unlist(ensembl_reactome))
  final_name_vec <- setNames(dup_mitch$SYMBOL, nm = dup_mitch$ENSEMBL)
  final_name_vec <- final_name_vec[-which(!(final_name_vec %in% mm_msigdbr$gene_symbol) & duplicated(final_name_vec))]
  # final_name_vec['ENSMUSG00000051396'] <- 'Gm45902'
  if (anyDuplicated(final_name_vec)) {
    stop("Duplicate in gene names")
  }
  # geneIDs <- uniquify(geneIDs)
  mitch_input_df <- mitch_input_df[names(final_name_vec), ]
  rownames(mitch_input_df) <- final_name_vec
}

## Setup Databases to test msig_hallmarck <- msigdbr::msigdbr(species = 'Mus musculus')

# bain <- msig_hallmarck %>% dplyr::select(c('gs_name', 'gene_symbol')) %>% group_split(gs_name)

# more_bain <- purrr::map(bain, function(x){x$gene_symbol})

# mitch::mitch_calc(mitch_input_df, more_bain, cores = 6, priority ='significance')

# mm_reactome <- buildReactome(output_type = enrichment_term_type) print(colnames(mitch_input_df))
# print(anyDuplicated(colnames(mitch_input_df)))
reac_test <- mitch::mitch_calc(mitch_input_df, msigdb_gene_list, cores = threads)

readr::write_tsv(reac_test$enrichment_result, mitch_table)
saveRDS(reac_test, file = mitch_rds)

mitch::mitch_report(reac_test, outfile = report_path)

R Snakemake tidyverse DESeq2 org.Mm.eg.db tibble AnnotationHub msigdbr ensembldb mitch From line 4 of scripts/run_mitch.R

Snakemake pipeline for Gene Set Enrichment: pathway and GO enrichment analysis

library("edgeR")
library(biomaRt)
library(dplyr)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(data.table)
library("gplots")
library(magrittr)
library(pathview)
library("org.Mm.eg.db")
library(gage)
library(gageData)
library(data.table)


args <- commandArgs(trailingOnly = TRUE)
dge = args[1]
ORGANISM  =args[2] 
compare_type = args[3]
outname = args[4]

res <- read.csv(file = dge, header = TRUE)
summary(res) 
foldchanges = res$logFC
names(foldchanges) = res$entrez
outname = paste(outname, compare_type, sep ="-")
outGO = paste(outname, "GO.csv", sep ="-")

if (ORGANISM == "HUMAN")
{
data(kegg.sets.hs)
data(go.sets.hs)
data(carta.hs)
data(sigmet.idx.hs)
data(go.subs.hs)
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
head(kegg.sets.hs,3)
#GO
data(go.sets.hs)
data(go.subs.hs)
gobpsets = go.sets.hs[go.subs.hs$BP]
gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =TRUE, compare =compare_type,cutoff=0.05)
lapply(gobpres, head)
write.csv(gobpres, outGO)

} else if (ORGANISM == "MOUSE"){
data(kegg.sets.mm)
data(go.sets.mm)
data(carta.mm)
data(sigmet.idx.mm)
data(go.subs.mm)


kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm]
head(kegg.sets.mm,3)
data(go.sets.mm)
data(go.subs.mm)
gobpsets = go.sets.mm[go.subs.mm$BP]
gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =TRUE, compare =compare_type,cutoff=0.05)
lapply(gobpres, head)
write.csv(gobpres, outGO)
} 

R dplyr data.table org.Hs.eg.db org.Mm.eg.db magrittr biomaRt gplots AnnotationDbi gage From line 1 of scripts/GO.R

library("edgeR")
library(biomaRt)
library(dplyr)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(data.table)
library("gplots")
library(magrittr)
library(pathview)
library("org.Mm.eg.db")
library(gage)
library(gageData)
library(data.table)


args <- commandArgs(trailingOnly = TRUE)
dge = args[1]
ORGANISM  =args[2] 
compare_type = args[3]
outname = args[4]
outname 

res <- read.csv(file = dge, header = TRUE)
outname = paste(outname, compare_type, sep ="-")
out = paste(outname, "KEGG.csv", sep="-")
outUP = paste(outname, "KEGG_UP.csv", sep ="-")
outDOWN = paste(outname, "KEGG_DOWN.csv", sep ="-")

foldchanges = res$logFC
names(foldchanges) <- res$entrez
summary(foldchanges)
#---------------------------
#KEGG Analysis 
#---------------------------
if (ORGANISM == "HUMAN")
{
data(kegg.sets.hs)
data(go.sets.hs)
data(carta.hs)
data(sigmet.idx.hs)
data(go.subs.hs)
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
head(kegg.sets.hs,3)
#---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways
data(kegg.gs)
keggres = gage(foldchanges, gsets =kegg.sets.hs, same.dir = TRUE, compare=compare_type,make.plot = TRUE)
lapply(keggres, head)
write.csv(keggres,out)
keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
write.csv(keggrespathwaysup, outUP)
write.csv(keggrespathwaysdn, outDOWN)
#-------------------------------------------------------------------------------------------------------------------------------
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsup
keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8)
#gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =FALSE, compare ="paired",cutoff=0.05)
#lapply(gobpres, head)
#---------------------------------------------------------Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human", new.signature=FALSE)
#---------------------------------------------------------plot multiple pathways ups and downs 
tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human"))
tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human"))
} else if (ORGANISM == "MOUSE"){
data(kegg.sets.mm)
data(go.sets.mm)
data(carta.mm)
data(sigmet.idx.mm)
data(go.subs.mm)
kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm]
head(kegg.sets.mm,3)
#---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways
data(kegg.gs)
keggres = gage(foldchanges, gsets =kegg.sets.mm, same.dir = TRUE, compare=compare_type,make.plot = TRUE)
lapply(keggres, head)
write.csv(keggres,out)
keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
write.csv(keggrespathwaysup, outUP)
write.csv(keggrespathwaysdn, outDOWN)
#-------------------------------------------------------------------------------------------------------------------------------
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsup
keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8)
#gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =FALSE, compare ="paired",cutoff=0.05)
#lapply(gobpres, head)
#---------------------------------------------------------Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse", new.signature=FALSE)
#---------------------------------------------------------plot multiple pathways ups and downs 
tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse"))
tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse"))
} 

R dplyr data.table org.Hs.eg.db org.Mm.eg.db magrittr biomaRt gplots AnnotationDbi gage pathview From line 1 of scripts/pathway.R

Differential Gene Expression using RNA-Seq

library(knitr)
knitr::opts_chunk$set(echo = TRUE)
library(DESeq2)
library(ggplot2) 
library(knitr)
library(clusterProfiler)
library(biomaRt)
library(ReactomePA)
library(DOSE)
library(KEGG.db)
library(org.Mm.eg.db)
library(org.Hs.eg.db)
library(pheatmap)
library(genefilter)
library(RColorBrewer)
library(GO.db)
library(topGO)
library(dplyr)
library(gage)
library(ggsci)

R Markdown ggplot2 dplyr org.Hs.eg.db GO.db org.Mm.eg.db RColorBrewer knitr biomaRt ggsci genefilter topGO clusterProfiler gage DOSE ReactomePA From line 7 of master/README.Rmd

source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2") ; library(DESeq2)
biocLite("ggplot2") ; library(ggplot2)
biocLite("clusterProfiler") ; library(clusterProfiler)
biocLite("biomaRt") ; library(biomaRt)
biocLite("ReactomePA") ; library(ReactomePA)
biocLite("DOSE") ; library(DOSE)
biocLite("KEGG.db") ; library(KEGG.db)
biocLite("org.Mm.eg.db") ; library(org.Mm.eg.db)
biocLite("org.Hs.eg.db") ; library(org.Hs.eg.db)
biocLite("pheatmap") ; library(pheatmap)
biocLite("genefilter") ; library(genefilter)
biocLite("RColorBrewer") ; library(RColorBrewer)
biocLite("GO.db") ; library(GO.db)
biocLite("topGO") ; library(topGO)
biocLite("dplyr") ; library(dplyr)
biocLite("gage") ; library(gage)
biocLite("ggsci") ; library(ggsci)

R Markdown ggplot2 dplyr DESeq2 org.Hs.eg.db GO.db org.Mm.eg.db RColorBrewer pheatmap biomaRt ggsci genefilter topGO clusterProfiler gage DOSE ReactomePA From line 472 of master/README.Rmd

Workflow Steps and Code Snippets

Code for the manuscript "Machine learning reveals STAT motifs as predictors for GR-mediated gene repression"

Workflow dervied form https://github.com/snakemake-workflows/rna-seq-star-deseq2 for multi condition Deseq2 and enrichment analyis (v0.1.1)

Snakemake pipeline for Gene Set Enrichment: pathway and GO enrichment analysis

Differential Gene Expression using RNA-Seq

org.Mm.eg.db