Workflow Steps and Code Snippets

11 tagged steps and code snippets that match keyword org.Mm.eg.db

Code for the manuscript "Machine learning reveals STAT motifs as predictors for GR-mediated gene repression"

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c( "--contrast_DexLPSvLPS"),
              type="character",
              help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"),
  make_option(c("--assignment_summit_prox"),
              type="character",
              help="Path to rds file of proximity based assignment of peak summits to genes"),
  make_option(c("--assignment_summits_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in DexLPS condition)"),
  make_option(c("--assignment_summits_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in LPS condition)"),
  make_option(c("--assignment_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in DexLPS condition)"),
  make_option(c( "--assignment_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in LPS condition)"),
  make_option(c("--motifcounts_summitregion"),
              type="character",
              help="Path to rds file of fimo motifcounts within summitregions"),
  make_option(c("--motifcounts_abcregion_dexlps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in DexLPS condition)"),
  make_option(c("--motifcounts_abcregion_lps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in LPS condition)"),
  make_option(c( "--outdir"),
              type="character",
              help="Path to output directory")
  )

opt <- parse_args(OptionParser(option_list=option_list))

dir.create(opt$outdir)

#change default for stringAsFactors
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(plyranges, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))

#set defaults for ggplot2 figures
theme_update(panel.background = element_rect(fill = "transparent", colour = NA),
             plot.background = element_rect(fill = "transparent", colour = NA),
             legend.background = element_rect(fill = "transparent", colour = NA),
             legend.key = element_rect(fill = "transparent", colour = NA),
             text=element_text(size=6, family = "ArialMT", colour="black"),
             title=element_text(size=8, family="ArialMT", colour="black"),
             panel.grid.major = element_line(colour="grey", size=0.2),
             panel.grid.minor = element_blank(),
             axis.text = element_text(size=6, family="ArialMT", colour="black"),
             axis.line = element_line(colour="black"),
             axis.ticks = element_line(colour="black"),
             legend.key.size = unit(6, 'points'), #change legend key size
             legend.key.height = unit(6, 'points'), #change legend key height
             legend.key.width = unit(6, 'points'), #change legend key width
             legend.text = element_text(size=6, family="ArialMT", colour="black"))

set.seed(12345)

#-------------------------------
## read in  data
#-------------------------------

contrast_DexLPSvLPS <- read.delim(opt$contrast_DexLPSvLPS)

for (optname in names(opt)[2:9]){ #except for the outdir
  print(paste0("Loading ", optname))
  assign(optname, readRDS( opt[[optname]] ))
}

#--------------------------------------------
## ---- function definitions
#--------------------------------------------

coerce_coef2df <- function(model_coef){
  model_coef <- as.matrix(model_coef)
  model_coef <- as.data.frame(model_coef)
  model_coef$names <- rownames(model_coef)
  colnames(model_coef) <- c("estimates", "names")
  return(model_coef)
}

#------------function will------------:
# * merge the motifdata with gene assignments and 
# * then merge the gene expression changes,
# * filter for DE genes and aggregate per gene
#--------------------------------------
merge_motifdata_with_assignments <- function( motifcounts, assignments, contrast, 
                                              maxonly=FALSE, excludepromoters=FALSE, weightby=FALSE, sepPromEnh=FALSE){

  # check if we should use all abc assignments, or only the max one of each peakID
  if(maxonly==TRUE){
    assignments <- assignments %>% group_by(name) %>% filter(abcscore==max(abcscore)) %>% distinct()
  } else{
    assignments <- assignments
  }

  if(excludepromoters=="all"){
    assignments <- assignments %>% filter(!class=="promoter")
  } else if (excludepromoters=="onlyNONself"){
    assignments <- assignments %>% filter(!c(class=="promoter" & isSelfPromoter=="False"))
  } else{
    assignments <- assignments
  }

  motifdf <- merge(motifcounts, assignments, 
                   by.x="name", by.y="name")

  motifdf <- motifdf %>% relocate(c(anno))

  # merge the expression change
  # use mgi_symbol for prox based, otherwise ensemblID
  # We DONT set all.x=TRUE because we don't care about predicting gene that aren't even expressed or that we don't have a clear label for
  if(maxonly=="prox"){
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="mgi_symbol")
  } else {
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="Row.names")
  }

  #recode the logFC and padj into a label (optionally through command line arguments)
  motifdf <- motifdf %>% 
    mutate(label=case_when(log2FoldChange>0.58 & padj < 0.05 ~ "up",
                           log2FoldChange<(-0.58) & padj < 0.05 ~ "down",
                           TRUE ~ "no_change")) %>%
    filter(label!="no_change") %>%
    mutate(label=factor(label,
                        levels=c("down","up"),
                        labels=c(0,1))) %>%
    relocate(label)

  # chr 2, 3 and 4 (20%) were used as the tuning set for hyperparameter tuning. 
  # Regions from chromosomes 1, 8 and 9 (20%) were used as the test set for performance evaluation 
  # The remaining regions were used for model training.

  #------aggregate over gene SYMBOL
  if ("abcscore" %in% colnames(motifdf)) { # loop for ABC based assignments

    if (weightby=="abcscore"){
      unselect_col <- "abcnumerator"
    } else if(weightby=="abcnumerator") {
      unselect_col <- "abcscore"
    } else{
      unselect_col <- c("abcscore","abcnumerator")
    }

    if (sepPromEnh==TRUE){
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(.,across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno,class) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()

      # From here we need to cast the motifcounts for the promoterregions, so that in the end we have one row per gene (instead of 1-2)
      motifdf_aggr <- motifdf_aggr %>% tidyr::pivot_wider(id_cols=c(label,seqnames,anno), 
                                                          names_from=class, 
                                                          values_from = !c(label,seqnames,anno, class),
                                                          values_fill = 0)
    } else {
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(., across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()
    }


  } else { # loop for prox based assignments
    motifdf_aggr <-
      motifdf %>% 
      dplyr::select(!c("name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype")) %>% 
      group_by(label,seqnames,anno) %>%
      dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
      ungroup()
  }

  return(motifdf_aggr)

}


#------------function needs------------:
# featurematrix and indexes to perform split of test vs trainval set
# (trainval set is used for CV to find optimal regularization)
#------------function will------------:
# run cv.glmnet (as elastic net regression) and pick lambda.1se as regularization
# determine model performance on test set
# plots for raw counts of certain factors are currently turned of
complete_GLM_analysis <- function(motifdf, trainvalidx, genenames){

  # Create training subset for model development & testing set for model performance testing
  # make it based on chromosomes, instead of completely random
  # as in bpnet, we set aside chr 1,8 and 9 for testing

  #inTrain <- sort(sample(nrow(motifdata_abc_aggr), nrow(motifdata_abc_aggr)*0.75))

  features_train <- motifdf[ trainvalidx, -c(1,2,3)] %>% as.matrix()
  features_test <- motifdf[ -trainvalidx, -c(1,2,3)] %>% as.matrix()
  targets_train <- motifdf[ trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()
  targets_test <- motifdf[ -trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()

  #------------------------
  ## Elastic net
  #------------------------

  cvfit_net <- glmnet::cv.glmnet(x=features_train, 
                                 y=targets_train, 
                                 family="binomial", 
                                 type.measure = "auc",
                                 nfolds = 6,
                                 alpha=0.5)


  # Performance on training data
  targets_train_net.prob <- predict(cvfit_net,
                                    type="response",
                                    newx = features_train,
                                    s = 'lambda.min')
  pred_train_net <- ROCR::prediction(targets_train_net.prob[,1], targets_train) #only need the probabilities for 1's

  auc_ROCR_train_net <- ROCR::performance(pred_train_net, measure = "auc") #to assess AUC for model
  auc_ROCR_train_net@y.values[[1]]

  # Predict on unseen data
  targets_net.prob <- predict(cvfit_net,
                              type="response",
                              newx = features_test,
                              s = 'lambda.min')
  pred_net <- ROCR::prediction(targets_net.prob[,1], targets_test) #only need the probabilities for 1's

  auc_ROCR_net <- ROCR::performance(pred_net, measure = "auc") #to assess AUC for model
  auc_ROCR_net@y.values[[1]]

  #------------------------
  ## Model coefficients
  #------------------------

  coef_df_net <- coerce_coef2df( coefficients(cvfit_net, s = 'lambda.min'))

  gg_net <- ggplot( data=coef_df_net %>% filter(abs(estimates)>0) )+
    geom_bar(aes(x=reorder(names, -estimates), y=estimates),
             stat="identity")+
    labs(title=" ",x="", y="Net coefs")+
    #coord_flip()+
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          plot.margin = unit(c(0, 0, -6, 10), "points"))

  #------------------------
  # ROC curves
  #------------------------
  # calculate probabilities for TPR/FPR for predictions
  perf_train_net <- ROCR::performance(pred_train_net,"tpr","fpr")
  perf_net <- ROCR::performance(pred_net,"tpr","fpr")

  # plot ROC curve

  gg_ROC <- ggplot()+
    geom_line( aes(x=perf_train_net@x.values[[1]], y=perf_train_net@y.values[[1]], colour = "train_net") ) +
    geom_line( aes(x=perf_net@x.values[[1]],y=perf_net@y.values[[1]], colour = "test_net") ) +
    geom_abline(intercept=0,slope=1, linetype=4, colour="grey")+
    scale_colour_manual(values=c("darkblue", "blue"), 
                        name=" ",
                        breaks=c("train_net", "test_net"),
                        labels=c(paste("net train. AUC:",round( auc_ROCR_train_net@y.values[[1]], 2)) ,
                                 paste("net test. AUC:", round( auc_ROCR_net@y.values[[1]], 2))
                        )
    )+
    labs(x="False positive rate", y="True positive rate")+
    theme(
      legend.position=c(0.65,0.15)
    )

  # add metric to global variable
  new_metrics = data.frame(
    net_train = auc_ROCR_train_net@y.values[[1]],
    net_test = auc_ROCR_net@y.values[[1]] 
  )


  full_panel <- ggpubr::ggarrange(gg_ROC,gg_net, nrow=2,
                                  labels=c("A","B"),
                                  heights = c(1,1))

  results <- list()
  results[[1]] <- full_panel
  results[[2]] <- new_metrics
  results[[3]] <- coef_df_net
  results[[4]] <- cvfit_net
  return(results)
}



#------------------------------------------------
## initialize object to track performance metrics
#------------------------------------------------

AUC_metrics = data.frame(
  # parameter combination
  condition=character(),
  motifdata=character(),
  excludepromoters=character(),
  onlymax=character(),
  weight=character(),
  sepPromEnh=character(),
  # performance
  net_train = numeric(),
  net_test = numeric()
)

net_model_coefs = list()


#------------------------------------------------
## run proximity based
#------------------------------------------------

# This is independent of the ABC results (no need to loop through different assignment variations)

print("Running GLM on prox-based assignments")
my_rdsfile <- paste0(opt$outdir,"prox.rds")
if(!file.exists(my_rdsfile)){

  motifdata_aggr <- merge_motifdata_with_assignments(motifcounts_summitregion,
                                                     assignment_summit_prox,
                                                     contrast_DexLPSvLPS,
                                                     maxonly="prox",
                                                     excludepromoters=FALSE,
                                                     weightby = FALSE,
                                                     sepPromEnh = FALSE)
  motifdata_aggr_scaled <- motifdata_aggr %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector)))

  motifdata_aggr_tranval_idx <- motifdata_aggr_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))

  performance <- complete_GLM_analysis (motifdata_aggr_scaled, motifdata_aggr_tranval_idx, genenames=genenames)

  #------------------------
  ## Raw counts
  #------------------------
  nr3c1counts <- motifdata_aggr %>% 
    group_by(label) %>% 
    dplyr::count(NR3C1) %>% 
    mutate(counts_fac = case_when(NR3C1>=1 ~ ">=1",
                                  NR3C1==0 ~ "0"), # aggregate it towards to top end
           counts_fac = factor(as.character(counts_fac), levels=c("0",">=1")) ) %>%
    group_by(counts_fac) %>% 
    mutate(freq = n / sum(n)) %>%
    group_by(label,counts_fac) %>%
    dplyr::summarize(sum_n=sum(n),
              sum_freq=sum(freq))

  gg_nr3c1counts <- ggplot(data=nr3c1counts , aes(x=counts_fac,y=sum_freq,fill=label)) + 
    geom_bar(stat="identity", position="dodge", alpha=0.5)+
    geom_text(aes(label=format(sum_freq, digits=2)),
              position = position_dodge(.9), 
              vjust = -0.5, 
              size = 3) + 
    scale_fill_manual(values=c("0"="blue", "1"="red"),
                      labels=c("DOWN","UP"))+
    labs(x="NR3C1 matches", y="% genes", fill="")+
    theme(
      plot.margin = unit(c(10, 6, 0, 10), "points")
    )
  gg_nr3c1counts

  relcounts <- motifdata_aggr %>% 
    group_by(label) %>% 
    dplyr::count(REL) %>% 
    mutate(counts_fac = case_when(REL>=1 ~ ">=1",
                                  REL==0 ~ "0"), # aggregate it towards to top end
           counts_fac = factor(as.character(counts_fac), levels=c("0",">=1")) ) %>%
    group_by(counts_fac) %>%
    mutate(freq = n / sum(n)) %>%
    group_by(label,counts_fac) %>%
    dplyr::summarize(sum_n=sum(n),
              sum_freq=sum(freq))

  gg_relcounts <- ggplot(data=relcounts , aes(x=counts_fac,y=sum_freq,fill=label)) + 
    geom_bar(stat="identity", position="dodge", alpha=0.5)+
    geom_text(aes(label=format(sum_freq, digits=2)),
              position = position_dodge(.9), 
              vjust = -0.5, 
              size = 3) + 
    scale_fill_manual(values=c("0"="blue", "1"="red"),
                      labels=c("DOWN","UP"))+
    labs(x="REL matches", y="% genes", fill="")+
    theme(
      plot.margin = unit(c(10, 6, 0, 10), "points")
    )
  gg_relcounts

  gg_rawcounts <-  ggpubr::ggarrange( gg_nr3c1counts, gg_relcounts, ncol=2, common.legend = TRUE, legend="bottom")
  saveRDS(gg_rawcounts, paste0(opt$outdir,"gg_rawcounts.rds"))

  #------------------------


  saveRDS(performance,
          file=my_rdsfile)
} else {
  performance <- readRDS(my_rdsfile)
}

plot(performance[[1]])

AUC_metrics <- rbind(AUC_metrics,
                     "prox" = c(
                       condition="dexlps",
                       motifdata="motifcounts_summitregion",
                       excludepromoters=FALSE,
                       onlymax="prox",
                       weight=FALSE,
                       sepPromEnh=FALSE,
                       performance[[2]]
                       )
                     )

net_model_coefs[["prox"]] <- performance[[3]]


## Raw counts
gg_firstgene <- ggplot() + 
  geom_bar(data=motifdata_aggr , aes(NR3C1,fill=label), position="dodge", alpha=0.5)+
  scale_fill_manual(values=c("0"="blue", "1"="red"),
                    labels=c("DOWN","UP"))+
  labs(x="NR3C1", y="counts", fill="")+
  theme(
    plot.margin = unit(c(10, 6, 0, 10), "points")
  )
gg_firstgene

#------------------------------------------------
## run example
#------------------------------------------------


# motifdata_aggr <- merge_motifdata_with_assignments(
#   motifcounts_summitregion,
#   assignment_summitregion_abc_dexlps,
#   contrast_DexLPSvLPS,
#   maxonly=FALSE, 
#   excludepromoters=FALSE, 
#   weightby=FALSE, 
#   sepPromEnh=FALSE)
# 
# motifdata_aggr_tranval_idx <- motifdata_aggr %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
# performance <- complete_GLM_analysis (motifdata_aggr, motifdata_aggr_tranval_idx, genenames=genenames)
# plot(performance[[1]])
# AUC_metrics <- rbind(AUC_metrics,
#                      "test" = c(condition="dexlps",
#                                 motifdata="motifcounts_peak",
#                                 onlymax=FALSE,
#                                 excludepromoters=FALSE,
#                                 weight=FALSE,
#                                 sepPromEnh=FALSE,
#                                 performance[[2]]))
# net_model_coefs[["test"]] <- performance[[3]]
# 
# gg_firstgene <- ggplot() + 
#   geom_bar(data=performance[[5]] , aes(NR3C1,fill=label), position="dodge", alpha=0.5)+
#   scale_fill_manual(values=c("0"="blue", "1"="red"),
#                     labels=c("DOWN","UP"))+
#   labs(x="NR3C1", y="counts", fill="")+
#   theme(
#     plot.margin = unit(c(10, 6, 0, 10), "points")
#   )
# gg_firstgene


#------------------------------------------------
## peakregion_counts - proxbased assignment
## peakregion_counts - abcbased assignment (both conditions)
## enhancerregion_counts - abcbased assignment (2 conditions)
#------------------------------------------------

# rewrite to assign values for both conditions and then use those to compute the difference as well

not_all_na <- function(x) any(!is.na(x))

for (motifdata in c("motifcounts_abcregion","motifcounts_summitregion")){
  for(excludepromoters in c(FALSE,"all","onlyNONself")){
    for (onlymax in c(TRUE,FALSE)){
      for (sepPromEnh in c(TRUE,FALSE)){
        for (weight in c(FALSE, "abcscore")){

          # set assignments fitting for the input data
          if(motifdata=="motifcounts_abcregion"){
            assignment_dexlps <- assignment_abcregion_dexlps
            assignment_lps <- assignment_abcregion_lps
            motifcounts_dexlps <- motifcounts_abcregion_dexlps
            motifcounts_lps <- motifcounts_abcregion_lps
          } else if (motifdata=="motifcounts_summitregion" ){
            # in this case the motifcounts for the 2 conditions are the same, but there assignments differ
            assignment_dexlps <- assignment_summits_abcregion_dexlps
            assignment_lps <- assignment_summits_abcregion_lps
            motifcounts_dexlps <- motifcounts_summitregion
            motifcounts_lps <- motifcounts_summitregion
          } else {break}

          #-----------------------DEXLPS------------------------------
          featurematrix_dexlps <- merge_motifdata_with_assignments(motifcounts_dexlps,
                                                                   assignment_dexlps,
                                                                   contrast_DexLPSvLPS,
                                                                   weightby=weight, 
                                                                   maxonly=onlymax, 
                                                                   excludepromoters=excludepromoters, 
                                                                   sepPromEnh=sepPromEnh)

          # remove columns that are NA after scaling 
          featurematrix_dexlps_scaled <- featurematrix_dexlps %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))

          #-----

          motifdata_aggr_tranval_idx <- featurematrix_dexlps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          modelname <- paste(motifdata,"condition_dexlps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here( paste0(opt$outdir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            performance <- complete_GLM_analysis (featurematrix_dexlps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)
            saveRDS(performance,file=my_rdsfile)
          } else {
            performance <- readRDS(my_rdsfile)
          }

          AUC_metrics <- rbind(AUC_metrics,
                               c(
                                 condition="dexlps",
                                 motifdata=motifdata,
                                 excludepromoters=excludepromoters,
                                 onlymax=onlymax,
                                 weight=weight,
                                 sepPromEnh=sepPromEnh,
                                 performance[[2]]
                                 )
                               )%>%
            magrittr::set_rownames(c(rownames(AUC_metrics),modelname))

          net_model_coefs[[modelname]] <- performance[[3]]

          #--------------------------LPS---------------------------
          featurematrix_lps <- merge_motifdata_with_assignments(motifcounts_lps,
                                                                assignment_lps,
                                                                contrast_DexLPSvLPS,
                                                                weightby=weight, 
                                                                maxonly=onlymax, 
                                                                excludepromoters=excludepromoters, 
                                                                sepPromEnh=sepPromEnh)
          featurematrix_lps_scaled <- featurematrix_lps %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          #-----
          motifdata_aggr_tranval_idx <- featurematrix_lps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          modelname <- paste(motifdata,"condition_lps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here(paste0(opt$outdir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            performance <- complete_GLM_analysis (featurematrix_lps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)
            saveRDS(performance,file=my_rdsfile)
          } else {
            performance <- readRDS(my_rdsfile)
          }

          AUC_metrics <- rbind(AUC_metrics,
                               c(
                                 condition="lps",
                                 motifdata=motifdata,
                                 excludepromoters=excludepromoters,
                                 onlymax=onlymax,
                                 weight=weight,
                                 sepPromEnh=sepPromEnh,
                                 performance[[2]]
                                 )
                               )%>%
            magrittr::set_rownames(c(rownames(AUC_metrics),modelname))

          net_model_coefs[[modelname]] <- performance[[3]]


          #-----------------------DIFFERENCE------------------------------

          # they contain the same motifs, but not the same genes. 
          # doublecheck that all columnnames are identical
          table(colnames(featurematrix_dexlps) == colnames(featurematrix_lps))

          #  motifcounts missing in one condition should be set to 0 

          merged_featurematrix <- merge(featurematrix_dexlps,
                                        featurematrix_lps, 
                                        by=c("anno","label","seqnames"),
                                        all=TRUE)

          # replace missing counts in one of the conditions with 0
          merged_featurematrix[is.na(merged_featurematrix)] <- 0

          # use dataframe suffix to grab respective columns
          featurematrix_diff <- 
            merged_featurematrix[grep(".x$",colnames(merged_featurematrix))] - 
            merged_featurematrix[grep(".y$",colnames(merged_featurematrix))] 
          # tidy up column names
          colnames(featurematrix_diff) <- gsub(".x$","", colnames(featurematrix_diff))

          # add first 3 columns back after computing the difference of the counts
          featurematrix_diff <- cbind(merged_featurematrix[1:3],featurematrix_diff)

          featurematrix_diff_scaled <- featurematrix_diff %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))

          # run GLM and look at performance  
          featurematrix_diff_tranval_idx <- featurematrix_diff_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))

          modelname <- paste(motifdata,"condition_DexLPS-LPS_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)
          my_rdsfile <- here(paste0(opt$outdir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            performance <- complete_GLM_analysis (featurematrix_diff_scaled, featurematrix_diff_tranval_idx, genenames=genenames)
            saveRDS(performance,file=my_rdsfile)
          } else {
            performance <- readRDS(my_rdsfile)
          }

          AUC_metrics <- rbind(AUC_metrics,
                               c(
                                 condition="DexLPS-LPS",
                                 motifdata=motifdata,
                                 excludepromoters=excludepromoters,
                                 onlymax=onlymax,
                                 weight=weight,
                                 sepPromEnh=sepPromEnh,
                                 performance[[2]]
                                 )
                               )%>%
            magrittr::set_rownames(c(rownames(AUC_metrics),modelname))

          net_model_coefs[[modelname]] <- performance[[3]]


        }
      }
    }
  }
}


#------------------------------------------------
## put model coefs into dataframes
#------------------------------------------------
motifnames <- colnames(motifcounts_summitregion)[3:ncol(motifcounts_summitregion)]

# make empty dataframe with all motifs
model_coefs_joint <- data.frame(featurename=motifnames)

# when we treat promoters and enhancers separately, the featurenames differ
model_coefs_sep <- data.frame(featurename= c(paste(motifnames,"genic",sep="_"),
                                             paste(motifnames,"promoter",sep="_"),
                                             paste(motifnames,"intergenic",sep="_"))
)

for (model in names(net_model_coefs)){
  if (grepl("sepPromEnh_FALSE",model)){
    model_coefs_joint <- merge(model_coefs_joint,
                               net_model_coefs[[model]],
                               by.x="featurename", by.y="names",
                               all=TRUE) %>% 
      dplyr::rename(!! model := "estimates")
  } else if (model=="prox"){
    model_coefs_joint <- merge(model_coefs_joint,
                               net_model_coefs[[model]],
                               by.x="featurename", by.y="names",
                               all=TRUE) %>% 
      dplyr::rename(!! model := "estimates")
  } else if (grepl("sepPromEnh_TRUE",model)){
    model_coefs_sep <- merge(model_coefs_sep,
                             net_model_coefs[[model]],
                             by.x="featurename", by.y="names",
                             all=TRUE) %>% 
      dplyr::rename(!! model := "estimates")
  } else { stop("The modelnames don't match the expected pattern.") }
}

#------------------------------------------------
## save data for plotting
#------------------------------------------------
saveRDS (model_coefs_joint, paste0(opt$outdir, "model_coefs_joint.rds"))
saveRDS (model_coefs_sep, paste0(opt$outdir, "model_coefs_sep.rds"))
saveRDS (AUC_metrics, paste0(opt$outdir, "AUC_metrics.rds"))

#------------------------------------------------
sessionInfo()
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c( "--ABC_DexLPS_all"),
              type="character",
              help="Path to file with all ABC predictions passing 0.02 (in DexLPS condition)"),
  make_option(c("--ABC_LPS_all"),
              type="character",
              help="Path to file with all ABC predictions passing 0.02 (in LPS condition)"),
  make_option(c("--contrast_DexVSDexLPS"),
              type="character",
              help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"),
  make_option(c("--chipseq_summits"),
              type="character",
              help="Path to summit file of IDR peaks"),
  make_option(c("--igv"),
              type="character",
              help="Path to IGV snapshot of genomic locus (created manually)")
)

opt <- parse_args(OptionParser(option_list=option_list))

options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(topGO, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggExtra, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(plyr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))

#set defaults for ggplot2 figures
theme_update(panel.background = element_rect(fill = "transparent", colour = NA),
             plot.background = element_rect(fill = "transparent", colour = NA),
             legend.background = element_rect(fill = "transparent", colour = NA),
             legend.key = element_rect(fill = "transparent", colour = NA),
             text=element_text(size=8, family = "ArialMT", colour="black"),
             title=element_text(size=10, family="ArialMT", colour="black"),
             panel.grid.major = element_blank(),
             panel.grid.minor = element_blank(),
             axis.text = element_text(size=8, family="ArialMT", colour="black"),
             axis.line = element_line(colour="black"),
             axis.ticks = element_line(colour="black"))

#----------------------------------------------------------------------
##--NOTE: example code for additional plots can be found in abc_visualizations.Rmd
#----------------------------------------------------------------------

#----------------------------------------------------------------------
##----------- load data
#----------------------------------------------------------------------

chipseq_summits <- rtracklayer::import.bed(opt$chipseq_summits)

contrast_DexLPSvLPS <- read.delim(opt$contrast_DexVSDexLPS)
# rename first column for merge later
colnames(contrast_DexLPSvLPS)[1] <- "EnsemblID"

ABC_DexLPS_all <- read.delim(opt$ABC_DexLPS_all)
ABC_LPS_all <- read.delim(opt$ABC_LPS_all)
# remove version number from ensembl IDs
ABC_DexLPS_all$TargetGene = gsub("\\.[0-9]*$","",ABC_DexLPS_all$TargetGene)
ABC_LPS_all$TargetGene = gsub("\\.[0-9]*$","",ABC_LPS_all$TargetGene)


#----------------------------------------------------------------------
##----------- data exploration
#----------------------------------------------------------------------

# What do the distributions of ABC scores look like

ggplot()+
  geom_density(data=ABC_DexLPS_all %>% filter(!class=="promoter"), aes(x=ABC.Score, colour="DexLPS"))+
  geom_density(data=ABC_LPS_all %>% filter(!class=="promoter"), aes(x=ABC.Score, colour="LPS"))+
  scale_colour_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC"))

# How do the ABC scores of promoters compare to those of enhancers?

ggplot()+
geom_density(data=ABC_DexLPS_all %>% filter(class=="promoter"), aes(x=ABC.Score, colour="promoter"))+
geom_density(data=ABC_DexLPS_all %>% filter(class=="genic"), aes(x=ABC.Score, colour="genic"))+
geom_density(data=ABC_DexLPS_all %>% filter(class=="intergenic"), aes(x=ABC.Score, colour="intergenic"))+
scale_colour_manual("",
                    breaks = c("promoter", "genic", "intergenic"),
                    values = c("red", "green", "blue"))

# What distance is used to define sth as promoter?

ggplot()+
  geom_density(data=ABC_DexLPS_all, aes(x=distance, colour=class))+
  scale_x_continuous(trans = "log10")+
  scale_colour_manual("",
                      breaks = c("promoter", "genic", "intergenic"),
                      values = c("red", "green", "blue"))

ggplot()+
  geom_density(data=ABC_DexLPS_all %>% filter(class=="promoter"), aes(x=distance, colour=isSelfPromoter))+
  scale_x_continuous(trans = "log10")+
  scale_colour_manual("isSelfPromoter",
                      breaks = c("True", "False"),
                      values = c("red", "blue"))+
  labs(title="All promoter regions, split by whether they're promoters for their target gene")+
  theme(
    plot.title = element_text(size=12)
  )
#----------------------------------------------------------------------
# ------ ------  Number of enhancers per gene
#----------------------------------------------------------------------
# Are most genes regulated by a single enhancer region or by multiple? How many does the ABC model assign per gene?

plot_enhancers_per_gene <- function(){
  DexLPS_TargetGeneCounts <- plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"), vars="TargetGene")
  LPS_TargetGeneCounts <- plyr::count(ABC_LPS_all %>% filter(!class=="promoter"), vars="TargetGene")

  gg <- ggplot()+
    geom_histogram(data=DexLPS_TargetGeneCounts, aes(x=freq, fill="DexLPS", colour="DexLPS"), binwidth = 1,  alpha=0.2)+
    geom_histogram(data=LPS_TargetGeneCounts, aes(x=freq, fill="LPS", colour="LPS"), binwidth = 1, alpha=0.2)+
    scale_fill_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC")) +
    scale_colour_manual("",
                        breaks = c("DexLPS", "LPS"),
                        values = c("#339966", "#0066CC")) +
    guides(color = "none")+
    theme(legend.position = c(0.8,0.8))+
    labs(title=" ",
         x="# of enhancers per gene",
         y="counts")

  return(gg)
}

gg_enh_per_gene <- plot_enhancers_per_gene()
gg_enh_per_gene

print("We find an average of")
plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"), vars="TargetGene") %>% pull(freq) %>% mean()
print("enhancers per gene for the DexLPS condition and")
plyr::count(ABC_LPS_all %>% filter(!class=="promoter"), vars="TargetGene") %>% pull(freq) %>% mean()
print("in LPS.")

#----------------------------------------------------------------------
# ------ ------  Number of genes per enhancer
#----------------------------------------------------------------------
# ABC allows for multiple assignments (an individual enhancer assigned to more than one promoter)  

plot_genes_per_enhancer <- function(){

  DexLPS_EnhancerCounts <- plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"),vars="name")
  LPS_EnhancerCounts <- plyr::count(ABC_LPS_all %>% filter(!class=="promoter"),vars="name")

  gg_full <- ggplot()+
    geom_histogram(data=DexLPS_EnhancerCounts, aes(x=freq, fill="DexLPS",colour="DexLPS"), binwidth = 1, alpha=0.2)+
    geom_histogram(data=LPS_EnhancerCounts, aes(x=freq, fill="LPS",colour="LPS"), binwidth = 1, alpha=0.2)+
    scale_fill_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC")) +
    scale_colour_manual("",
                        breaks = c("DexLPS", "LPS"),
                        values = c("#339966", "#0066CC")) +
    guides(color = FALSE)+
    labs(title="Number of genes an individual enhancer is assigned to",
         x="# of genes per enhancer",
         y="counts")
  plot(gg_full)

  gg_zoom <- ggplot()+
    geom_histogram(data=DexLPS_EnhancerCounts, aes(x=freq, fill="DexLPS",colour="DexLPS"), binwidth = 1, alpha=0.2)+
    geom_histogram(data=LPS_EnhancerCounts, aes(x=freq, fill="LPS",colour="LPS"), binwidth = 1, alpha=0.2)+
    scale_fill_manual("",
                      breaks = c("DexLPS", "LPS"),
                      values = c("#339966", "#0066CC")) +
    scale_colour_manual("",
                        breaks = c("DexLPS", "LPS"),
                        values = c("#339966", "#0066CC")) +
    xlim(c(0,8))+
    guides(color = FALSE)+
    theme(legend.position = c(0.8,0.8))+
    labs(title=" ",
         x="# of genes per enhancer",
         y="counts")
  plot(gg_zoom)

  print("We find an average of")
  DexLPS_EnhancerCounts %>% pull(freq) %>% mean() %>% print()
  print("genes per enhancer in the DexLPS condition and")
  LPS_EnhancerCounts %>% pull(freq) %>% mean() %>% print()
  print("in LPS.")

  return(gg_zoom)
}

gg_genes_per_enhancer_zoom <- plot_genes_per_enhancer()


#----------------------------------------------------------------------
# ------ Comparing ABC Scores (including promoter regions) between the conditions
#----------------------------------------------------------------------

# merge regions from both conditions but also keep those that are only in one
# 
# add info no whether the target genes are DE or now and what direction they're changed

LPS_GR<- plyranges::as_granges(ABC_LPS_all, seqnames=chr)
DexLPS_GR <- plyranges::as_granges(ABC_DexLPS_all, seqnames=chr)

#----------------------------------------------------------------------
merge_LPSandDexLPS_ABCdata <- function(ABC_LPS_all_GR, ABC_DexLPS_all_GR){
  ABC_LPS_all_unique <- unique(ABC_LPS_all_GR)
  ABC_DexLPS_all_unique <- unique(ABC_DexLPS_all_GR)

  # and then figure out which ones overlap with one another
  overlap_ABC_all_unique <- ChIPpeakAnno::findOverlapsOfPeaks (ABC_LPS_all_unique,
                                                               ABC_DexLPS_all_unique,
                                                               connectedPeaks = "keepAll",
                                                               ignore.strand = TRUE)
  # we coerce the info of overlapping regions into a dataframe
  overlap_ABC_all_unique_df <- as.data.frame(overlap_ABC_all_unique$overlappingPeaks)

  # and clean up column names
  colnames(overlap_ABC_all_unique_df)<- gsub("ABC_LPS_all_unique...ABC_DexLPS_all_unique.","", colnames(overlap_ABC_all_unique_df))
  colnames(overlap_ABC_all_unique_df)<- gsub(".1", "", colnames(overlap_ABC_all_unique_df))
  colnames(overlap_ABC_all_unique_df)<- paste( c(rep("LPS_",27),rep("DexLPS_",27),rep("",2)), colnames(overlap_ABC_all_unique_df), sep="")

  # we use this overlap to define a new variable that holds the info of which regions of the conditions form a pair
  overlap_ABC_all_unique_df$pairID <- paste("pair", seq(1, nrow(overlap_ABC_all_unique_df)), sep="" )

  #-----------------for DexLPS-------------------
  # merge the pairID onto the original dataframes
  ABC_DexLPS_all_wpairID <- merge(ABC_DexLPS_all,
                                  overlap_ABC_all_unique_df[,c("DexLPS_name","pairID")],
                                  by.x="name",
                                  by.y="DexLPS_name",
                                  all.x=TRUE)
  # for those that don't correspond to a pair, we assign condition and name as pairID
  ABC_DexLPS_all_wpairID <- ABC_DexLPS_all_wpairID %>% 
    mutate(pairID = case_when(is.na(pairID) ~ paste("DexLPS",name, sep="_"),
                              TRUE ~ pairID) )
  # combination of pairID and gene they're assigned to forms the assignment variable.
  # we will use this later to plot values from the 2 conditions, that have the same assignment
  ABC_DexLPS_all_wpairID$assignment <- paste(ABC_DexLPS_all_wpairID$pairID,
                                             ABC_DexLPS_all_wpairID$TargetGene,
                                             sep="_")

  #-----------------for LPS-------------------
  # merge the pairID onto the original dataframes
  ABC_LPS_all_wpairID <- merge(ABC_LPS_all,
                               overlap_ABC_all_unique_df[,c("LPS_name","pairID")],
                               by.x="name",
                               by.y="LPS_name",
                               all.x=TRUE)
  # for those that don't correspond to a pair, we assign condition and name as pairID
  ABC_LPS_all_wpairID <- ABC_LPS_all_wpairID %>% 
    mutate(pairID = case_when(is.na(pairID) ~ paste("LPS",name, sep="_"),
                              TRUE ~ pairID) )

  ABC_LPS_all_wpairID$assignment <- paste(ABC_LPS_all_wpairID$pairID,
                                          ABC_LPS_all_wpairID$TargetGene,
                                          sep="_")

  merged_conditions <- merge(ABC_DexLPS_all_wpairID,
                                 ABC_LPS_all_wpairID,
                                 by="assignment", 
                                 all=TRUE,
                                 suffixes = c(".DexLPS", ".LPS"))
  merged_conditions <- merged_conditions %>%
    mutate(TargetGene = case_when(TargetGene.DexLPS==TargetGene.LPS ~ TargetGene.DexLPS,
                                  is.na(TargetGene.DexLPS) ~ TargetGene.LPS,
                                  is.na(TargetGene.LPS) ~ TargetGene.DexLPS,
                                  TRUE ~ NA_character_)
    )

  return(merged_conditions)
}

merged_conditions_ABC <- merge_LPSandDexLPS_ABCdata(LPS_GR,DexLPS_GR)

#----------------------------------------------------------------------
# ------ merge on gene expression results
#----------------------------------------------------------------------

# merge the RNAseq results to use as colour later
merged_conditions_ABC <- merge(merged_conditions_ABC, contrast_DexLPSvLPS, 
                               by.x="TargetGene", by.y="EnsemblID")
merged_conditions_ABC <- merged_conditions_ABC %>%
  mutate(change=case_when(padj<0.05 & log2FoldChange>0.58 ~ "up",
                          padj<0.05 & log2FoldChange<(-0.58) ~ "down",
                          TRUE ~ "no change"))

# replace the na with 0
merged_conditions_ABCnomissing <- merged_conditions_ABC %>% 
  mutate(ABC.Score.DexLPS = case_when(is.na(ABC.Score.DexLPS) ~ 0,
                                      TRUE ~ ABC.Score.DexLPS) ) %>%
  mutate(ABC.Score.LPS = case_when(is.na(ABC.Score.LPS) ~ 0,
                                   TRUE ~ ABC.Score.LPS) )

cor_ABCscores <- cor(merged_conditions_ABCnomissing$ABC.Score.DexLPS,
                     merged_conditions_ABCnomissing$ABC.Score.LPS)

gg_merged_conditions_ABCnomissing <- ggplot(merged_conditions_ABCnomissing)+
  geom_point(aes(x=ABC.Score.DexLPS, y=ABC.Score.LPS, fill=log2FoldChange), alpha=0.2, size=1, stroke = 0, shape=21)+
  scale_fill_gradient(low="blue", high="red")+
  ylim(c(0,0.8))+
  xlim(c(0,0.8))+
  geom_text(x=0.2, y=0.75, label=paste("r = ",format(cor_ABCscores, digits = 2)))+
  labs(fill="log2FC", x="ABC score Dex+LPS", y="ABC score LPS")+
  theme(legend.position = "top")

gg_merged_conditions_ABCnomissing

#----------------------------------------------------------------------
# ------ plot foldchange in ABC score vs foldchange in expression
#----------------------------------------------------------------------

cor_ABCdiff_log2FC <- with(merged_conditions_ABCnomissing %>% filter(change!="no change"),
                           cor((ABC.Score.DexLPS - ABC.Score.LPS),log2FoldChange))

gg_ABCdiff_log2FC <- ggplot(merged_conditions_ABCnomissing %>% filter(change!="no change"), aes(x=(ABC.Score.DexLPS - ABC.Score.LPS), y=log2FoldChange)) +
  geom_hex(bins=70)+
  viridis::scale_fill_viridis()+
  geom_smooth(method = "lm", colour="black")+
  geom_text(x=0.6, y=5, label=paste("r = ",format(cor_ABCdiff_log2FC, digits = 2)))+
  labs(x="ABC score Dex+LPS - ABC score LPS", y="expression log2FC(Dex+LPS / LPS)")+
  theme(legend.position = "top")



#----------------------------------------------------------------------
# ------ ABC score by peak presence
#----------------------------------------------------------------------

plot_ABC_with_marginals <- function(whatchange){
  gg <- ggplot(data= merged_conditions_ABCnomissing %>% filter(change==whatchange) , 
               aes(x=ABC.Score.DexLPS, y=ABC.Score.LPS ) )+
    geom_point( aes( fill=whatchange) , size=1, alpha=1,stroke = 0, shape=21) +
    geom_abline(slope = 1)+
    ylim(c(0,0.8))+
    xlim(c(0,0.8))+
    labs(x="ABC score Dex+LPS", y="ABC score LPS")+
    scale_fill_manual(
      breaks=c("up","no change","down"),
      values=c("red","black","blue") )+
    theme(legend.position = "none")

  gg_marg <- ggExtra::ggMarginal(gg, type="histogram", size=2,
                                 xparams = list(bins=85), yparams = list(bins=85))

  return(gg_marg)
}

ggplot_ABC_with_marginals_up <- plot_ABC_with_marginals("up")
ggplot_ABC_with_marginals_up
ggplot_ABC_with_marginals_down <-plot_ABC_with_marginals("down")
ggplot_ABC_with_marginals_down

#----------------------------------------------------------------------
# ------ show marginals as bar plot
#----------------------------------------------------------------------
gg_marginals_dexlps <- 
  ggplot()+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="up"), aes(x=ABC.Score.DexLPS, fill="up"), alpha=0.2, bins=85)+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="down"), aes(x=ABC.Score.DexLPS, fill="down"), alpha=0.2, bins=85)+
  scale_fill_manual(
    breaks=c("up","down"),
    values=c("red","blue") )+
  xlim(c(NA,0.3))+
  labs(fill="gene expression change",x="ABC score DexLPS", y="ABC score LPS")+
  theme(legend.position = c(0.7,0.7))
gg_marginals_lps <- 
  ggplot()+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="up"), aes(x=ABC.Score.LPS, fill="up"), alpha=0.2, bins=85)+
  geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="down"), aes(x=ABC.Score.LPS, fill="down"), alpha=0.2, bins=85)+
  scale_fill_manual(
    breaks=c("up","down"),
    values=c("red","blue") )+ 
  xlim(c(NA,0.3))+
  labs(fill="gene expression change",x="ABC score DexLPS", y="ABC score LPS")+
  theme(legend.position = c(0.7,0.7))

#----------------------------------------------------------------------
# ------ ABC score by peak presence
#----------------------------------------------------------------------
# Is there a difference in ABC score between the enhancers overlapping with a GR summit vs the ones that don't?


DexLPS_GR_summits_overlap <- IRanges::findOverlaps(DexLPS_GR,chipseq_summits) #(query,subject)
DexLPS_GR$haspeakID <- FALSE
DexLPS_GR$haspeakID[queryHits(DexLPS_GR_summits_overlap)] <- TRUE


plot_ABCscore_ofDEgenes_byGRoverlap <- function(){
  DE_ENSEMBL <- contrast_DexLPSvLPS %>%  filter(padj<0.05 & abs(log2FoldChange)>0.58 ) %>% pull(EnsemblID)

  DexLPS_enhancers_DEsubset <- 
    as.data.frame(DexLPS_GR) %>% 
    filter(TargetGene %in% DE_ENSEMBL) %>%
    filter(!class=="promoter")

  DexLPS_enhancers_DEsubset %>% filter(haspeakID==TRUE) %>% dplyr::pull(ABC.Score) %>% mean() %>% print()
  DexLPS_enhancers_DEsubset %>% filter(haspeakID==FALSE) %>% dplyr::pull(ABC.Score) %>% mean() %>% print()

  # show distribution for the two groups

  gg1 <- ggplot()+
    geom_density(data=DexLPS_enhancers_DEsubset %>% filter(haspeakID==TRUE), 
                 aes(x=ABC.Score, colour="haspeakID", fill="haspeakID"), alpha=0.3)+
    geom_density(data=DexLPS_enhancers_DEsubset %>% filter(haspeakID==FALSE), 
                 aes(x=ABC.Score, colour="nopeakID", fill="nopeakID"), alpha=0.3)+
    scale_x_continuous(trans = "log2")+
    scale_colour_manual("",
                        labels = c("has GR peak", "no GR peak"),
                        breaks = c("haspeakID", "nopeakID"),
                        values = c("darkmagenta", "cadetblue"))+
    scale_fill_manual("",
                      labels = c("has GR peak", "no GR peak"),
                      breaks = c("haspeakID", "nopeakID"),
                      values = c("darkmagenta", "cadetblue"))+
    guides(colour="none")+
    #xlim(c(-3,0))+
    labs(title=" ",
         x="ABC score Dex+LPS",
         y="density")+
    theme(
      axis.text = element_text(size=12, family="ArialMT", colour="black"),
      axis.title = element_text(size=12, family="ArialMT", colour="black"),
      axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
      legend.position = c(0.8,0.8)
    )
  plot(gg1)

  gg2 <- DexLPS_enhancers_DEsubset %>%
    ggplot( aes(x=hic_contact_pl_scaled_adj, y=log2(activity_base))) +
    geom_hex()+
    facet_wrap(~haspeakID, labeller="label_both")+
    viridis::scale_fill_viridis()+
    labs(title = " ", x="Hi-C contact", y="log2(base activity)")

  plot(gg2)
  all_plots=list()
  all_plots[[1]] <- gg1
  all_plots[[2]] <- gg2
  return(all_plots)
}

gg_ABCscore_ofDEgenes_byGRoverlap <- plot_ABCscore_ofDEgenes_byGRoverlap()


#----------------------------------------------------------------------
# ------ load IGV plot
#----------------------------------------------------------------------
igv <- png::readPNG( opt$igv )

gg_igv <- ggplot() + 
  ggpubr::background_image(igv) +
  # so it doesn't get squished
  #coord_fixed()+
  # This ensures that the image leaves some space at the edges
  theme(plot.margin = margin(t=0, l=0, r=0, b=0, unit = "cm"),
        axis.line = element_blank())


#----------------------------------------------------------------------
# ------ put figure panel together
#----------------------------------------------------------------------

# save giant files separately
ggsave(here("./results/current/Figures/Figure_abcresults_3B.bmp"), gg_merged_conditions_ABCnomissing,
       width=75, height=100, units="mm",
       bg="white")
ggsave(here("./results/current/Figures/Figure_abcresults_3B.png"), gg_merged_conditions_ABCnomissing,
       width=75, height=100, units="mm",
       bg="white")

gg_placeholder <- ggplot() + 
  theme(plot.margin = margin(t=0, l=0, r=0, b=0, unit = "cm"),
        axis.line = element_blank())

# first row
gg_r1c1 <-  ggpubr::ggarrange(gg_enh_per_gene, gg_genes_per_enhancer_zoom,
                            labels = c("A", NA),
                            ncol = 1, nrow = 2, heights = c(1,1))

gg_r1c2 <-  ggpubr::ggarrange(gg_placeholder, ggplot_ABC_with_marginals_up, ggplot_ABC_with_marginals_down, 
                            labels = c("B", "C", NA),
                            ncol = 3, nrow=1, widths=c(1,1,1))

gg_r1 <- ggpubr::ggarrange(gg_r1c1, gg_r1c2, 
                           labels = c(NA, NA),
                           ncol = 2, nrow=1, widths=c(1,3))

# second row
gg_r2 <-  ggpubr::ggarrange(gg_ABCdiff_log2FC, gg_ABCscore_ofDEgenes_byGRoverlap[[1]], gg_ABCscore_ofDEgenes_byGRoverlap[[2]], 
                            labels = c("D","E", "F"),
                            ncol = 3, nrow=1, widths=c(1,1,1.5))


gg_tophalf <- ggpubr::ggarrange(gg_r1, gg_r2, 
                                nrow=2, heights =c(1,1))
gg_tophalf

full_panel <- ggpubr::ggarrange(gg_tophalf, gg_igv, 
                                labels=c(NA,"G"),
                                nrow=2, heights = c(1.8,1))

#full_panel
ggsave(here("./results/current/Figures/Figure_abcresults.png"), full_panel,
       width=300, height=300, units="mm",
       bg="white")
ggsave(here("./results/current/Figures/Figure_abcresults.pdf"), full_panel,
       width=300, height=300, units="mm",
       bg="white")
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c("--ctss_pool1"), 
              type="character",
              help="Path for ctss file from FANTOM5 of pool 1"),
  make_option(c("--ctss_pool2"), 
              type="character",
              help="Path for ctss file from FANTOM5 of pool 2"),
  make_option(c("--liftoverchain"), 
              type="character",
              help="Path to liftover chain file for mm9 to mm10"),
  make_option(c("--gencode_mm9_geneanno"), 
              type="character",
              help="Path to genomic reference file for mm9"),
  make_option(c("--gencode_mm10_geneanno"), 
              type="character",
              help="GENCODE genomic reference for assembly mm10, prefiltered for gene entries"),
  make_option(c("--outdir"), 
              type="character",
              help="Output directory")
)
opt <- parse_args(OptionParser(option_list=option_list))

suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm9, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ChIPseeker, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(CAGEr, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(rtracklayer, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(GenomicRanges, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, warn.conflicts=F, quietly=T))

outdir <- here(opt$outdir)

# Workflow is based on the CAGEr vignette
# https://www.bioconductor.org/packages/release/bioc/vignettes/CAGEr/inst/doc/CAGEexp.html


#----------------------   Import CAGE samples from BMDMs
#------------------------------------------------------------------------------------

# get URL for public samples, download and read necessariy columns into ctss file (see snakemake workflow)
# We want to import bone-marrow derived macrophage samples through CAGEr.
# After looking at list of available datasets we can decide what samples best fit our needs.
# Let's see what samples are available through FANTOM5
#data(FANTOM5mouseSamples)
#head(FANTOM5mouseSamples)
# The FANTOM5 dataframe holds descriptions of the samples and the url where they can be retrieved.
# There's an easy way to import samples that match a certain term into a CAGEset object.
#mac_samples <- FANTOM5mouseSamples[grep("macrophage, bone marrow derived",
#                                        FANTOM5mouseSamples[,"description"]),]

print("NOTE: reference genome for the public CAGE files is mm9!")

ce <- CAGEr::CAGEexp( genomeName = "BSgenome.Mmusculus.UCSC.mm9",
                      inputFiles = c(opt$ctss_pool1 ,opt$ctss_pool2),
                      inputFilesType = "ctss",
                      sampleLabels   = c("pool1","pool2")
)
# To actually read in the data into the object we use getCTSS() function, that will add an experiment called tagCountMatrix to the CAGEexp object.
ce <- CAGEr::getCTSS(ce)
ce

#------------------------------------------------------------------------------------
#----------------------   QC
#------------------------------------------------------------------------------------

ncbim37_anno <- rtracklayer::import.gff(opt$gencode_mm9_geneanno)

ce <- annotateCTSS(ce, ncbim37_anno)
colData(ce)[,c("librarySizes", "promoter", "exon", "intron", "unknown")]
plotAnnot(ce, "counts")

corr.m <- plotCorrelation2( ce, samples = "all"
                            , tagCountThreshold = 1, applyThresholdBoth = FALSE
                            , method = "pearson")

#------------------------------------------------------------------------------------
#----------------------   Get read clusters
#------------------------------------------------------------------------------------

print("Merging samples")
#Now we can merge them
ce <- mergeSamples(ce, mergeIndex = c(1,1), 
                   mergedSampleLabels = c("BMDM"))

# redo annotation since this gets reset during merging
ce <- annotateCTSS(ce, ncbim37_anno)

print("The total library size is:")
print(librarySizes(ce))

# Check if data follows a power law distribution
plotReverseCumulatives(ce, fitInRange = c(5, 3000), onePlot = TRUE)

print("Normalizing reads")
# Since we don't really care about making comparisons between different population we could prob just skip the normalization
# The fit range is chosen from the plot. We take the alpha from the ref distribution and set T to a million to get the tag count per million (TPM)
ce <- normalizeTagCount(ce, method = "powerLaw", fitInRange = c(5, 3000), alpha = 1.15, T = 1*10^6)
#mac_CAGEset@tagCountMatrix

print("Cluster the tags")
# After normalization we can cluster the tags.
# Clustering, only seems to work with the CAGEset object (due to some problems with the IRanges column)
# From the CAGEr vignette:
# "Transcription start sites are found in the promoter region of a gene and reflect the transcriptional activity of that promoter (Figure 5). TSSs in the close proximity of each other give rise to a functionally equivalent set of transcripts and are likely regulated by the same promoter elements. Thus, TSSs can be spatially clustered into larger transcriptional units, called tag clusters (TCs) that correspond to individual promoters. CAGEr supports three methods for spatial clustering of TSSs along the genome, two ab initio methods driven by the data itself, as well as assigning TSSs to predefined genomic regions:"

ce <- clusterCTSS(ce,
           threshold=1,
           thresholdIsTpm = TRUE,
           nrPassThreshold = 1,
           method="distclu",
           maxDist=20,
           removeSingletons = TRUE,
           keepSingletonsAbove = 3)

# Let's have a look what the result looks like
head(tagClustersGR(ce, sample = "BMDM"))

# calculate cumulative distribution for every tag cluster in each of the samples
ce <- cumulativeCTSSdistribution(ce, clusters = "tagClusters", useMulticore = T)
# determine the positions of selected quantiles
ce <- quantilePositions(ce, clusters = "tagClusters", qLow = 0.1, qUp = 0.9)

# How many tagclusters do we have in total?
length(tagClustersGR(ce, sample = "BMDM"))

# histogram of interquantile width
plotInterquantileWidth(ce, clusters = "tagClusters", tpmThreshold = 3, qLow = 0.1, qUp = 0.9)

print("Retrieving clusters as GenomicRanges")
clusters_gr <- tagClustersGR(ce, sample="BMDM")

#------------------------------------------------------------------------------------
#----------------------   Liftover coordinates to mm10
#------------------------------------------------------------------------------------

print("Liftover to mm10")

# * Now we can lift over the intervals to mm10
# * Annotate them with peakanno
# * pick the most highly expressed one for each gene

liftover <- function(peaks_gr_mm9){ #input is a GenomicRanges object in mm9 coordinates
  #lift peak locations from mm9 to mm10
  chain <- rtracklayer::import.chain(opt$liftoverchain)
  on.exit( close( file(opt$liftoverchain)) )

  peaks_gr_mm10 <- rtracklayer::liftOver(peaks_gr_mm9, chain)
  peaks_gr_mm10 <- GenomicRanges::GRanges(unlist(peaks_gr_mm10))

  return(peaks_gr_mm10)
}

mac_cage_mm10 <- liftover( clusters_gr )
ggplot(as.data.frame(mac_cage_mm10), aes(x=width)) +
  geom_histogram(bins = 100)

# Liftover coordinates of dominant_ctss
dominant_ctss <- liftover( 
  GRanges(
    seqnames = seqnames(clusters_gr), 
    ranges = IRanges(start = clusters_gr$dominant_ctss,
                     end = clusters_gr$dominant_ctss),
    score=clusters_gr$score)
)

#------------------------------------------------------------------------------------
#-----------------   Annotate TSS clusters to reference gene coordinates
#------------------------------------------------------------------------------------
print("Annotate TSS coordinates")
# Use coordinates of the dominant ctss downstream

mac_cage_anno <- ChIPseeker::annotatePeak(dominant_ctss, 
                                          tssRegion=c(-1000, 1000), #more stringent than default
                                          level = "gene",
                                          TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, 
                                          annoDb = "org.Mm.eg.db")

# For those that are reasonably close to a TSS,
# check for each gene, which position has the highest score.

mac_cage_maxscore <- as.data.frame(mac_cage_anno) %>% 
  filter(abs(distanceToTSS)<=30000) %>%
  mutate(SYMBOL=as.factor(SYMBOL)) %>%
  filter(SYMBOL!="") %>%
  group_by(SYMBOL)%>%
  filter(score == max(score))%>%
  filter(distanceToTSS == min(distanceToTSS )) # for tied score, use shorter distance

nrow(mac_cage_maxscore)

ggplot(mac_cage_maxscore, aes(x=distanceToTSS)) +
  geom_histogram(bins = 100)

#------------------------------------------------------------------------------------
#----------   retrieve gene coordinates and promoterregion from reference
#------------------------------------------------------------------------------------

gencode_mm10_geneanno <- rtracklayer::import.gff(opt$gencode_mm10_geneanno)
genecoords <- as.data.frame(gencode_mm10_geneanno) %>%
  dplyr::select("seqnames","start","end","strand","gene_id") %>%
  mutate(score=0) %>%
  dplyr::mutate(gene_id=gsub("\\.[0-9]*$","",gene_id)) %>%
  dplyr::filter(!"gene_id"=="")%>%
  dplyr::select("seqnames","start","end","gene_id","score","strand")


gencode_mm10_promoterregions <- promoters(gencode_mm10_geneanno)
gencode_mm10_promoterregions <- as.data.frame(gencode_mm10_promoterregions) %>%
  dplyr::select("seqnames","start","end","strand","gene_id") %>%
  mutate(score=0) %>%
  dplyr::mutate(gene_id=gsub("\\.[0-9]*$","",gene_id)) %>%
  dplyr::filter(!"gene_id"=="")%>%
  dplyr::select("seqnames","start","end","gene_id","score","strand")

#------------------------------------------------------------------------------------
#-----------------   export files
#------------------------------------------------------------------------------------

write.table(genecoords, 
            file = paste0(outdir,"reference_genecoords.bed"),
            sep="\t", 
            col.names = FALSE,
            quote=FALSE,
            row.names = FALSE)

write.table(gencode_mm10_promoterregions, 
            file = paste0(outdir,"reference_promoterregions.bed"),
            sep="\t", 
            col.names = FALSE,
            quote=FALSE,
            row.names = FALSE)


rtracklayer::export.bed(as.data.frame(mac_cage_mm10),
                        paste0(outdir, "mac_cage_tssclusterregions.bed"),
                        format="bed")

rtracklayer::export.bed(as.data.frame(dominant_ctss),
                        paste0(outdir, "mac_cage_dominant_ctss.bed"),
                        format="bed")

rtracklayer::export.bed(mac_cage_maxscore,
                        paste0(outdir,"mac_cage_maxscore.bed"),
                        format="bed")
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T))

option_list <- list(
  make_option(c( "--contrast_DexLPSvLPS"),
              type="character",
              help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"),
  make_option(c("--assignment_summit_prox"),
              type="character",
              help="Path to rds file of proximity based assignment of peak summits to genes"),
  make_option(c("--assignment_summits_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in DexLPS condition)"),
  make_option(c("--assignment_summits_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of peak summits within abcregions to genes (in LPS condition)"),
  make_option(c("--assignment_abcregion_dexlps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in DexLPS condition)"),
  make_option(c( "--assignment_abcregion_lps"),
              type="character",
              help="Path to rds file of assignment of abcregions to genes (in LPS condition)"),
  make_option(c("--motifcounts_summitregion"),
              type="character",
              help="Path to rds file of fimo motifcounts within summitregions"),
  make_option(c("--motifcounts_abcregion_dexlps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in DexLPS condition)"),
  make_option(c("--motifcounts_abcregion_lps"),
              type="character",
              help="Path to rds file of fimo motifcounts within ABC regions (in LPS condition)"),
  make_option(c("--model_coefs_joint"),
              type="character",
              help="Path to rds file with model coefficients of joint models"),
  make_option(c("--model_coefs_sep"),
              type="character",
              help="Path to rds file with model coefficients of models tha include enhancers and promoters separately"),
  make_option(c( "--featuredir"),
              type="character",
              help="Path to directory with unscales featurematrizes"),
  make_option(c( "--outfile"),
              type="character",
              help="Path to output file with metrics")
)

opt <- parse_args(OptionParser(option_list=option_list))

dir.create(opt$featuredir)

#change default for stringAsFactors
options(stringsAsFactors = FALSE)

suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(plyranges, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T))
suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T))

#set defaults for ggplot2 figures
theme_update(panel.background = element_rect(fill = "transparent", colour = NA),
             plot.background = element_rect(fill = "transparent", colour = NA),
             legend.background = element_rect(fill = "transparent", colour = NA),
             legend.key = element_rect(fill = "transparent", colour = NA),
             text=element_text(size=6, family = "ArialMT", colour="black"),
             title=element_text(size=8, family="ArialMT", colour="black"),
             panel.grid.major = element_line(colour="grey", size=0.2),
             panel.grid.minor = element_blank(),
             axis.text = element_text(size=6, family="ArialMT", colour="black"),
             axis.line = element_line(colour="black"),
             axis.ticks = element_line(colour="black"),
             legend.key.size = unit(6, 'points'), #change legend key size
             legend.key.height = unit(6, 'points'), #change legend key height
             legend.key.width = unit(6, 'points'), #change legend key width
             legend.text = element_text(size=6, family="ArialMT", colour="black"))

set.seed(12345)

#-------------------------------
## read in  data
#-------------------------------

contrast_DexLPSvLPS <- read.delim(opt$contrast_DexLPSvLPS)

for (optname in names(opt)[2:11]){ #except for the featuredir
  print(paste0("Loading ", optname))
  assign(optname, readRDS( opt[[optname]] ))
}

#--------------------------------------------
## ---- function definitions
#--------------------------------------------

coerce_coef2df <- function(model_coef){
  model_coef <- as.matrix(model_coef)
  model_coef <- as.data.frame(model_coef)
  model_coef$names <- rownames(model_coef)
  colnames(model_coef) <- c("estimates", "names")
  return(model_coef)
}

#------------function will------------:
# * merge the motifdata with gene assignments and 
# * then merge the gene expression changes,
# * filter for DE genes and aggregate per gene
#--------------------------------------
merge_motifdata_with_assignments <- function( motifcounts, assignments, contrast, 
                                              maxonly=FALSE, excludepromoters=FALSE, weightby=FALSE, sepPromEnh=FALSE){

  # check if we should use all abc assignments, or only the max one of each peakID
  if(maxonly==TRUE){
    assignments <- assignments %>% group_by(name) %>% filter(abcscore==max(abcscore)) %>% distinct()
  } else{
    assignments <- assignments
  }

  if(excludepromoters=="all"){
    assignments <- assignments %>% filter(!class=="promoter")
  } else if (excludepromoters=="onlyNONself"){
    assignments <- assignments %>% filter(!c(class=="promoter" & isSelfPromoter=="False"))
  } else{
    assignments <- assignments
  }

  motifdf <- merge(motifcounts, assignments, 
                   by.x="name", by.y="name")

  motifdf <- motifdf %>% relocate(c(anno))

  # merge the expression change
  # use mgi_symbol for prox based, otherwise ensemblID
  # We DONT set all.x=TRUE because we don't care about predicting gene that aren't even expressed or that we don't have a clear label for
  if(maxonly=="prox"){
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="mgi_symbol")
  } else {
    motifdf <- merge(motifdf, contrast, 
                     by.x="anno", by.y="Row.names")
  }

  #recode the logFC and padj into a label (optionally through command line arguments)
  motifdf <- motifdf %>% 
    mutate(label=case_when(log2FoldChange>0.58 & padj < 0.05 ~ "up",
                           log2FoldChange<(-0.58) & padj < 0.05 ~ "down",
                           TRUE ~ "no_change")) %>%
    filter(label!="no_change") %>%
    mutate(label=factor(label,
                        levels=c("down","up"),
                        labels=c(0,1))) %>%
    relocate(label)

  # chr 2, 3 and 4 (20%) were used as the tuning set for hyperparameter tuning. 
  # Regions from chromosomes 1, 8 and 9 (20%) were used as the test set for performance evaluation 
  # The remaining regions were used for model training.

  #------aggregate over gene SYMBOL
  if ("abcscore" %in% colnames(motifdf)) { # loop for ABC based assignments

    if (weightby=="abcscore"){
      unselect_col <- "abcnumerator"
    } else if(weightby=="abcnumerator") {
      unselect_col <- "abcscore"
    } else{
      unselect_col <- c("abcscore","abcnumerator")
    }

    if (sepPromEnh==TRUE){
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(.,across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno,class) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()

      # From here we need to cast the motifcounts for the promoterregions, so that in the end we have one row per gene (instead of 1-2)
      motifdf_aggr <- motifdf_aggr %>% tidyr::pivot_wider(id_cols=c(label,seqnames,anno), 
                                                          names_from=class, 
                                                          values_from = !c(label,seqnames,anno, class),
                                                          values_fill = 0)
    } else {
      motifdf_aggr <-
        motifdf %>% 
        dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>%
        { if(weightby!=FALSE) mutate(., across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score
        dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways
        group_by(label,seqnames,anno) %>%
        dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
        ungroup()
    }


  } else { # loop for prox based assignments
    motifdf_aggr <-
      motifdf %>% 
      dplyr::select(!c("name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype")) %>% 
      group_by(label,seqnames,anno) %>%
      dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts
      ungroup()
  }

  return(motifdf_aggr)

}

get_feature_and_label_metrics <- function(motifdf, trainvalidx, genenames){

  targets_train <- motifdf[ trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()
  targets_test <- motifdf[ -trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix()

  metrics = data.frame(
    n_input_features = ncol( motifdf[ , -c(1,2,3)]),# first three columns are labels, chromosome and gene annotation
    n_neg_inst_train = table(targets_train)[['0']],
    n_pos_inst_train = table(targets_train)[['1']],
    n_neg_inst_test = table(targets_test)[['0']],
    n_pos_inst_test = table(targets_test)[['1']]
  )

  return(metrics)
}

#------------------------------------------------
## initialize object to gather all metrics
#------------------------------------------------
all_metrics=data.frame(
  n_input_features=numeric(),
  n_neg_inst_train=numeric(),
  n_pos_inst_train=numeric(),
  n_neg_inst_test=numeric(),
  n_pos_inst_test=numeric()
)
#------------------------------------------------
## run proximity based
#------------------------------------------------

#---------------------- 
# This is independent of the ABC results (no need to loop through different assignment variations)

print("Getting metrics of prox-based model")
my_rdsfile <- paste0(opt$featuredir,"prox.rds")
if(!file.exists(my_rdsfile)){

  motifdata_aggr_prox <- merge_motifdata_with_assignments(motifcounts_summitregion,
                                                     assignment_summit_prox,
                                                     contrast_DexLPSvLPS,
                                                     maxonly="prox",
                                                     excludepromoters=FALSE,
                                                     weightby = FALSE,
                                                     sepPromEnh = FALSE)

  saveRDS(motifdata_aggr_prox,file=my_rdsfile)
} else{
  motifdata_aggr_prox <- readRDS(my_rdsfile)
}

motifdata_aggr_prox_scaled <- motifdata_aggr_prox %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector)))
motifdata_aggr_prox_tranval_idx <- motifdata_aggr_prox_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
metrics_prox <- get_feature_and_label_metrics (motifdata_aggr_prox_scaled, motifdata_aggr_prox_tranval_idx, genenames=genenames)

all_metrics <- rbind(all_metrics,
                     "prox"=metrics_prox)


#---------------------- 

not_all_na <- function(x) any(!is.na(x))

for (motifdata in c("motifcounts_abcregion","motifcounts_summitregion")){
  for(excludepromoters in c(FALSE,"all","onlyNONself")){
    for (onlymax in c(TRUE,FALSE)){
      for (sepPromEnh in c(TRUE,FALSE)){
        for (weight in c(FALSE, "abcscore")){

          # set assignments fitting for the input data
          if(motifdata=="motifcounts_abcregion"){
            assignment_dexlps <- assignment_abcregion_dexlps
            assignment_lps <- assignment_abcregion_lps
            motifcounts_dexlps <- motifcounts_abcregion_dexlps
            motifcounts_lps <- motifcounts_abcregion_lps
          } else if (motifdata=="motifcounts_summitregion" ){
            # in this case the motifcounts for the 2 conditions are the same, but there assignments differ
            assignment_dexlps <- assignment_summits_abcregion_dexlps
            assignment_lps <- assignment_summits_abcregion_lps
            motifcounts_dexlps <- motifcounts_summitregion
            motifcounts_lps <- motifcounts_summitregion
          } else {break}

          #-----------------------DEXLPS------------------------------
          modelname <- paste(motifdata,"condition_dexlps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here( paste0(opt$featuredir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            featurematrix_dexlps <- merge_motifdata_with_assignments(motifcounts_dexlps,
                                                                   assignment_dexlps,
                                                                   contrast_DexLPSvLPS,
                                                                   weightby=weight, 
                                                                   maxonly=onlymax, 
                                                                   excludepromoters=excludepromoters, 
                                                                   sepPromEnh=sepPromEnh)

            saveRDS(featurematrix_dexlps, my_rdsfile)
          } else{
            featurematrix_dexlps <- readRDS(my_rdsfile)
          }
          # remove columns that are NA after scaling
          featurematrix_dexlps_scaled <- featurematrix_dexlps %>%
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          motifdata_aggr_tranval_idx <- featurematrix_dexlps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          new_metric <- get_feature_and_label_metrics (featurematrix_dexlps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)

          all_metrics <- rbind(all_metrics,
                               new_metric)%>%
            magrittr::set_rownames(c(rownames(all_metrics),modelname))

          #--------------------------LPS---------------------------
          modelname <- paste(motifdata,"condition_lps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)

          my_rdsfile <- here(paste0(opt$featuredir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            featurematrix_lps <- merge_motifdata_with_assignments(motifcounts_lps,
                                                                assignment_lps,
                                                                contrast_DexLPSvLPS,
                                                                weightby=weight, 
                                                                maxonly=onlymax, 
                                                                excludepromoters=excludepromoters, 
                                                                sepPromEnh=sepPromEnh)

            saveRDS(featurematrix_lps,file=my_rdsfile)
          } else{
            featurematrix_lps <- readRDS(my_rdsfile)
          }
          featurematrix_lps_scaled <- featurematrix_lps %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          motifdata_aggr_tranval_idx <- featurematrix_lps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          new_metric <- get_feature_and_label_metrics (featurematrix_lps_scaled, motifdata_aggr_tranval_idx, genenames=genenames)

          all_metrics <- rbind(all_metrics,
                               new_metric)%>%
            magrittr::set_rownames(c(rownames(all_metrics),modelname))

          #-----------------------DIFFERENCE------------------------------

          modelname <- paste(motifdata,"condition_DexLPS-LPS_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_")
          print(modelname)
          my_rdsfile <- here(paste0(opt$featuredir, modelname,".rds"))
          if(!file.exists(my_rdsfile)){
            # they contain the same motifs, but not the same genes.
            # doublecheck that all columnnames are identical
            table(colnames(featurematrix_dexlps) == colnames(featurematrix_lps))

            #  motifcounts missing in one condition should be set to 0 
            merged_featurematrix <- merge(featurematrix_dexlps,
                                        featurematrix_lps, 
                                        by=c("anno","label","seqnames"),
                                        all=TRUE)
            # replace missing counts in one of the conditions with 0
            merged_featurematrix[is.na(merged_featurematrix)] <- 0

            # use dataframe suffix to grab respective columns
            featurematrix_diff <- 
              merged_featurematrix[grep(".x$",colnames(merged_featurematrix))] - 
              merged_featurematrix[grep(".y$",colnames(merged_featurematrix))] 
            # tidy up column names
            colnames(featurematrix_diff) <- gsub(".x$","", colnames(featurematrix_diff))

            # add first 3 columns back after computing the difference of the counts
            featurematrix_diff <- cbind(merged_featurematrix[1:3],featurematrix_diff)

            saveRDS(featurematrix_diff,file=my_rdsfile)
          } else{
            featurematrix_diff <- readRDS(my_rdsfile)
          }
          featurematrix_diff_scaled <- featurematrix_diff %>% 
            mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>%
            dplyr::select(where(not_all_na))
          # run GLM and look at performance  
          featurematrix_diff_tranval_idx <- featurematrix_diff_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9"))
          new_metric <- get_feature_and_label_metrics (featurematrix_diff_scaled, featurematrix_diff_tranval_idx, genenames=genenames)

          all_metrics <- rbind(all_metrics,
                               new_metric)%>%
            magrittr::set_rownames(c(rownames(all_metrics),modelname))


        }
      }
    }
  }
}

all_metrics <- all_metrics %>%
  mutate(ratio_pos_inst_train = n_pos_inst_train/(n_pos_inst_train+n_neg_inst_train),
         ratio_pos_inst_test = n_pos_inst_test/(n_pos_inst_test+n_neg_inst_test)
         )

# let'S add the number of non-zero coefficients to this
model_coefs_joint <- readRDS( opt$model_coefs_joint )
model_coefs_sep <- readRDS( opt$model_coefs_sep )

nonzero_columentries <- rbind ( colSums(model_coefs_joint != 0, na.rm=TRUE) %>% as.data.frame(),
                            colSums(model_coefs_sep != 0, na.rm=TRUE) %>% as.data.frame() )
colnames(nonzero_columentries) <- "n_sel_features_inclintercept"

all_metrics <- merge(all_metrics, nonzero_columentries, by="row.names") %>% 
  relocate(n_sel_features_inclintercept, .after = n_input_features )


# parse feature engineering choices from modelname and add them as variables
all_metrics <- all_metrics %>%
  mutate(condition = case_when(grepl("condition_lps", Row.names) ~ "LPS",
                              grepl("condition_DexLPS-LPS", Row.names) ~ "Dex+LPS - LPS",
                              grepl("condition_dexlps", Row.names) ~ "Dex+LPS"),
         input_region = case_when(grepl("motifcounts_summitregion", Row.names) ~ "GR summitregions",
                                  grepl("motifcounts_abcregion", Row.names) ~ "active regions"),
         excludepromoters = case_when(grepl("exclprom_all", Row.names) ~ "all",
                                      grepl("exclprom_onlyNONself", Row.names) ~ "nonself",
                                      grepl("exclprom_FALSE", Row.names) ~ "none"),
         mapping = case_when(grepl("onlymax_FALSE", Row.names) ~ "1-to-many (ABC-based)",
                             grepl("onlymax_TRUE", Row.names) ~ "1-to-1 (ABC-based)"),
         weight = case_when(grepl("weight_abcscore", Row.names) ~ "abcscore",
                             grepl("weight_FALSE", Row.names) ~ "none"),
         prom_and_enh_features = case_when(grepl("sepPromEnh_TRUE", Row.names) ~ "separate",
                                      grepl("sepPromEnh_FALSE", Row.names) ~ "aggregate")

         )
# manually update choices for the reference model
all_metrics <- all_metrics %>%
  rows_update( tibble(Row.names = "prox", 
                      condition="Dex+LPS",
                      input_region="GR summitregions",
                      mapping="1-to-1 (proximity-based)",
                      weight="none"), 
               by = "Row.names")

summary(all_metrics)

summary(all_metrics$ratio_pos_inst_train-all_metrics$ratio_pos_inst_test)

write.table(all_metrics,
            file=opt$outfile,
            quote=FALSE, sep="\t", row.names = FALSE, col.names = TRUE )

Workflow dervied form https://github.com/snakemake-workflows/rna-seq-star-deseq2 for multi condition Deseq2 and enrichment analyis (v0.1.1)

11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
knitr::opts_chunk$set(
  echo = FALSE, warning = FALSE, message = FALSE, dev = "png",
  fig.width = 12, fig.height = 12
)
library(purrr)
library(clusterProfiler)
library(magrittr)
library(ggplot2)
library(org.Mm.eg.db)
library(svglite)
if (exists("snakemake")) {
  gsea_list <- snakemake@input[["gsea_result"]]
  plot_path <- snakemake@params[["plot_path"]]
  c_groups <- snakemake@params[["contrast_groups"]]
  write(unlist(c_groups), file = stderr())
  names(gsea_list) <- names(x = c_groups)
  species_name <- snakemake@config[["organism"]]
  gene_name_type <- snakemake@config[["gene_name_type"]]
  gsea_conf <- snakemake@config[["gsea"]]
  save_plots <- TRUE
} else {
  conf <- yaml::read_yaml("../../../configs/VascAge_Apelin_config.yaml")
  BASE_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR)
  cond_id <- names(conf$diffexp$contrasts)[1]
  gsea_list <- as.list(file.path(BASE_DIR, glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.gseares.RDS")))
  c_groups <- names(conf$diffexp$contrasts[[cond_id]])
  names(gsea_list) <- c_groups
  #  gsea_list <- gsea_list[1:3]
  plot_path <- "/home/heyer/tmp/"
  species_name <- "Mus musculus"
  gene_name_type <- "ENSEMBL"
  save_plots <- FALSE
  gsea_conf <- conf$gsea
}
limit <-  10
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
library(tibble)
if (!require(RNAscripts)) {
  devtools::install("./scripts/RNAscripts", upgrade = "never")
}
library(RNAscripts)
if (packageVersion("mitch") < "1.1.8") {
  devtools::install_github("markziemann/mitch")
}
# library(org.Mm.eg.db)
library(mitch)
library(tidyverse)
library(ensembldb)
library(AnnotationHub)
## Load the annotation resource.
ah <- AnnotationHub()
Gtf <- ah["AH28753"]
## Create a EnsDb database file from this.
DbFile <- ensDbFromAH(Gtf)
## We can either generate a database package, or directly load the data
edb <- EnsDb(DbFile)
library(msigdbr)
if (exists("snakemake")) {
  # Input Files
  diffexp_tables_paths <- snakemake@input[["tables"]]
  contrast_list <- snakemake@params[["contrasts"]]
  fpkm_path <- snakemake@input[["fpkm"]]
  # Output Files
  mitch_table <- snakemake@output[["mitch_table"]]
  mitch_rds <- snakemake@output[["mitch_rds"]]
  report_path <- snakemake@output[["mitch_report"]]
  # Parameters
  threads <- snakemake@threads
  enrichment_term_type <- "SYMBOL"
} else {
  BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022/"
  contrast_list <- c(
    "basal_cre_pos_vs_basal_cre_neg", "tumor_cre_pos_vs_tumor_cre_neg", "tumor_cre_pos_vs_basal_cre_pos",
    "tumor_cre_neg_vs_basal_cre_neg"
  )
  diffexp_tables_paths <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/{contrast_list}.diffexp.tsv")))


  fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv")
  threads <- 1
  report_path <- "big_yeeter.html"
  enrichment_term_type <- "SYMBOL"
}

## Read data
diff_exp_tables <- purrr::map(diffexp_tables_paths, readr::read_tsv, col_names = c(
  "gene_id", "baseMean", "logFoldChange",
  "lfcSE", "stat", "pvalue", "padj"
), skip = 1)
names(diff_exp_tables) <- contrast_list
diff_exp_frames <- purrr::map(diff_exp_tables, function(x) {
  gene_ids <- x$gene_id
  diff_exp_fr <- as.data.frame(x[, -1])
  rownames(diff_exp_fr) <- gene_ids
  diff_exp_fr
})
# names(diff_exp_frames) <- contrast_list Read FPKM file
fpkm <- readr::read_tsv(fpkm_path)

match_table <- fpkm %>%
  dplyr::select(c("gene", "gname"))
match_vec <- setNames(object = match_table$gname, nm = match_table$gene)


### Setupt mitch input data
prep_mitch_input <- function(deseq_list, e_term = "ENSEMBL") {
  # mitch_input_df <- mitch::mitch_import(diff_exp_frames, 'DESeq2', geneTable = as.data.frame(fpkm[,c('gene',
  # 'gname')]))
  mitch_input_df <- mitch::mitch_import(deseq_list, "DESeq2")
  gname_tb <- tibble::tibble(Transcript_id = rownames(mitch_input_df))
  if (e_term == "ENSEMBL") {
    gname_tb["gene_id"] <- stringr::str_extract(gname_tb$Transcript_id, "ENSMUSG[0-9]*")
  } else if (e_term == "SYMBOL") {
    gname_tb["gene_id"] <- match_vec[gname_tb$Transcript_id]
  }
  rownames(mitch_input_df) <- gname_tb$gene_id
  colnames(mitch_input_df) <- stringr::str_remove_all(colnames(mitch_input_df), pattern = "-")
  ifelse(anyDuplicated(gname_tb$gene_id), warning("Duplicated genes in mitch input dataframe."), print("no duplicates"))

  mitch_input_df
}
#' Uniquify a vector
#'
#' @param x Input factor to check
#' @return corrected facotr value
#' @examples
#' NULL
uniquify <- function(x) {
  while (anyDuplicated(x)) {
    x[duplicated(x)] <- paste0(x[duplicated(x)], "_1")
  }
  x
}
mitch_input_df <- prep_mitch_input(diff_exp_frames)
# mm_reactome <- buildReactome(output_type = enrichment_term_type) ensembl_reactome <- buildReactome(output_type =
# 'ENSEMBL')


if (enrichment_term_type == "SYMBOL") {
  # Get GENE ids
  new_ids <- ensembldb::select(edb, keys = rownames(mitch_input_df), keytype = "GENEID", columns = c("SYMBOL", "GENEID"))
  update_vector <- setNames(new_ids$SYMBOL, nm = new_ids$GENEID)
  # Stop if any duplicated genes in reactome gene set
  names(match_vec) <- stringr::str_extract(names(match_vec), "ENSMUSG[0-9]*")

  geneIDs <- match_vec[rownames(mitch_input_df)]
  geneIDs[names(update_vector)] <- update_vector


  col_names <- c("gs_name", "gene_symbol")
  mm_msigdbr <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>%
    dplyr::select(col_names)
  msigdb_gene_list <- split(x = mm_msigdbr$gene_symbol, f = mm_msigdbr$gs_name)
  ensembl_in_msigdb <- new_ids %>%
    dplyr::filter(SYMBOL %in% mm_msigdbr$gene_symbol) %>%
    pull(GENEID)

  ## Cleanup duplicates
  mitch_input_df <- mitch_input_df[(rownames(mitch_input_df) %in% ensembl_in_msigdb) | !(duplicated(geneIDs[rownames(mitch_input_df)]) |
    duplicated(geneIDs[rownames(mitch_input_df)], fromLast = T)), ]

  geneIDs <- match_vec[rownames(mitch_input_df)]
  # geneIDs[names(update_vector)] <- update_vector
  dup_mitch <- mitch_input_df[] %>%
    as_tibble(rownames = "ENSEMBL")
  dup_mitch["SYMBOL"] <- geneIDs[dup_mitch$ENSEMBL]
  # Remove samples which(dup_mitch$ENSEMBL %in% unlist(ensembl_reactome))
  final_name_vec <- setNames(dup_mitch$SYMBOL, nm = dup_mitch$ENSEMBL)
  final_name_vec <- final_name_vec[-which(!(final_name_vec %in% mm_msigdbr$gene_symbol) & duplicated(final_name_vec))]
  # final_name_vec['ENSMUSG00000051396'] <- 'Gm45902'
  if (anyDuplicated(final_name_vec)) {
    stop("Duplicate in gene names")
  }
  # geneIDs <- uniquify(geneIDs)
  mitch_input_df <- mitch_input_df[names(final_name_vec), ]
  rownames(mitch_input_df) <- final_name_vec
}

## Setup Databases to test msig_hallmarck <- msigdbr::msigdbr(species = 'Mus musculus')

# bain <- msig_hallmarck %>% dplyr::select(c('gs_name', 'gene_symbol')) %>% group_split(gs_name)

# more_bain <- purrr::map(bain, function(x){x$gene_symbol})

# mitch::mitch_calc(mitch_input_df, more_bain, cores = 6, priority ='significance')

# mm_reactome <- buildReactome(output_type = enrichment_term_type) print(colnames(mitch_input_df))
# print(anyDuplicated(colnames(mitch_input_df)))
reac_test <- mitch::mitch_calc(mitch_input_df, msigdb_gene_list, cores = threads)

readr::write_tsv(reac_test$enrichment_result, mitch_table)
saveRDS(reac_test, file = mitch_rds)

mitch::mitch_report(reac_test, outfile = report_path)

Snakemake pipeline for Gene Set Enrichment: pathway and GO enrichment analysis

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
library("edgeR")
library(biomaRt)
library(dplyr)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(data.table)
library("gplots")
library(magrittr)
library(pathview)
library("org.Mm.eg.db")
library(gage)
library(gageData)
library(data.table)


args <- commandArgs(trailingOnly = TRUE)
dge = args[1]
ORGANISM  =args[2] 
compare_type = args[3]
outname = args[4]

res <- read.csv(file = dge, header = TRUE)
summary(res) 
foldchanges = res$logFC
names(foldchanges) = res$entrez
outname = paste(outname, compare_type, sep ="-")
outGO = paste(outname, "GO.csv", sep ="-")

if (ORGANISM == "HUMAN")
{
data(kegg.sets.hs)
data(go.sets.hs)
data(carta.hs)
data(sigmet.idx.hs)
data(go.subs.hs)
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
head(kegg.sets.hs,3)
#GO
data(go.sets.hs)
data(go.subs.hs)
gobpsets = go.sets.hs[go.subs.hs$BP]
gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =TRUE, compare =compare_type,cutoff=0.05)
lapply(gobpres, head)
write.csv(gobpres, outGO)

} else if (ORGANISM == "MOUSE"){
data(kegg.sets.mm)
data(go.sets.mm)
data(carta.mm)
data(sigmet.idx.mm)
data(go.subs.mm)


kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm]
head(kegg.sets.mm,3)
data(go.sets.mm)
data(go.subs.mm)
gobpsets = go.sets.mm[go.subs.mm$BP]
gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =TRUE, compare =compare_type,cutoff=0.05)
lapply(gobpres, head)
write.csv(gobpres, outGO)
} 
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
library("edgeR")
library(biomaRt)
library(dplyr)
library("AnnotationDbi")
library("org.Hs.eg.db")
library(data.table)
library("gplots")
library(magrittr)
library(pathview)
library("org.Mm.eg.db")
library(gage)
library(gageData)
library(data.table)


args <- commandArgs(trailingOnly = TRUE)
dge = args[1]
ORGANISM  =args[2] 
compare_type = args[3]
outname = args[4]
outname 

res <- read.csv(file = dge, header = TRUE)
outname = paste(outname, compare_type, sep ="-")
out = paste(outname, "KEGG.csv", sep="-")
outUP = paste(outname, "KEGG_UP.csv", sep ="-")
outDOWN = paste(outname, "KEGG_DOWN.csv", sep ="-")

foldchanges = res$logFC
names(foldchanges) <- res$entrez
summary(foldchanges)
#---------------------------
#KEGG Analysis 
#---------------------------
if (ORGANISM == "HUMAN")
{
data(kegg.sets.hs)
data(go.sets.hs)
data(carta.hs)
data(sigmet.idx.hs)
data(go.subs.hs)
kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs]
head(kegg.sets.hs,3)
#---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways
data(kegg.gs)
keggres = gage(foldchanges, gsets =kegg.sets.hs, same.dir = TRUE, compare=compare_type,make.plot = TRUE)
lapply(keggres, head)
write.csv(keggres,out)
keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
write.csv(keggrespathwaysup, outUP)
write.csv(keggrespathwaysdn, outDOWN)
#-------------------------------------------------------------------------------------------------------------------------------
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsup
keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8)
#gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =FALSE, compare ="paired",cutoff=0.05)
#lapply(gobpres, head)
#---------------------------------------------------------Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human", new.signature=FALSE)
#---------------------------------------------------------plot multiple pathways ups and downs 
tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human"))
tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human"))
} else if (ORGANISM == "MOUSE"){
data(kegg.sets.mm)
data(go.sets.mm)
data(carta.mm)
data(sigmet.idx.mm)
data(go.subs.mm)
kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm]
head(kegg.sets.mm,3)
#---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways
data(kegg.gs)
keggres = gage(foldchanges, gsets =kegg.sets.mm, same.dir = TRUE, compare=compare_type,make.plot = TRUE)
lapply(keggres, head)
write.csv(keggres,out)
keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>%
  tbl_df() %>%
  filter(row_number()<=30) %>%
  .$id %>%
  as.character()
write.csv(keggrespathwaysup, outUP)
write.csv(keggrespathwaysdn, outDOWN)
#-------------------------------------------------------------------------------------------------------------------------------
keggresidsup = substr(keggrespathwaysup, start=1, stop=8)
keggresidsup
keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8)
#gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =FALSE, compare ="paired",cutoff=0.05)
#lapply(gobpres, head)
#---------------------------------------------------------Define plotting function for applying later
plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse", new.signature=FALSE)
#---------------------------------------------------------plot multiple pathways ups and downs 
tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse"))
tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse"))
} 

Differential Gene Expression using RNA-Seq

 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
library(DESeq2)
library(ggplot2) 
library(knitr)
library(clusterProfiler)
library(biomaRt)
library(ReactomePA)
library(DOSE)
library(KEGG.db)
library(org.Mm.eg.db)
library(org.Hs.eg.db)
library(pheatmap)
library(genefilter)
library(RColorBrewer)
library(GO.db)
library(topGO)
library(dplyr)
library(gage)
library(ggsci)
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
source("https://bioconductor.org/biocLite.R")
biocLite("DESeq2") ; library(DESeq2)
biocLite("ggplot2") ; library(ggplot2)
biocLite("clusterProfiler") ; library(clusterProfiler)
biocLite("biomaRt") ; library(biomaRt)
biocLite("ReactomePA") ; library(ReactomePA)
biocLite("DOSE") ; library(DOSE)
biocLite("KEGG.db") ; library(KEGG.db)
biocLite("org.Mm.eg.db") ; library(org.Mm.eg.db)
biocLite("org.Hs.eg.db") ; library(org.Hs.eg.db)
biocLite("pheatmap") ; library(pheatmap)
biocLite("genefilter") ; library(genefilter)
biocLite("RColorBrewer") ; library(RColorBrewer)
biocLite("GO.db") ; library(GO.db)
biocLite("topGO") ; library(topGO)
biocLite("dplyr") ; library(dplyr)
biocLite("gage") ; library(gage)
biocLite("ggsci") ; library(ggsci)
data / bioconductor

org.Mm.eg.db

Genome wide annotation for Mouse: Genome wide annotation for Mouse, primarily based on mapping using Entrez Gene identifiers.