Workflow Steps and Code Snippets
11 tagged steps and code snippets that match keyword org.Mm.eg.db
Code for the manuscript "Machine learning reveals STAT motifs as predictors for GR-mediated gene repression"
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 | suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T)) option_list <- list( make_option(c( "--contrast_DexLPSvLPS"), type="character", help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"), make_option(c("--assignment_summit_prox"), type="character", help="Path to rds file of proximity based assignment of peak summits to genes"), make_option(c("--assignment_summits_abcregion_dexlps"), type="character", help="Path to rds file of assignment of peak summits within abcregions to genes (in DexLPS condition)"), make_option(c("--assignment_summits_abcregion_lps"), type="character", help="Path to rds file of assignment of peak summits within abcregions to genes (in LPS condition)"), make_option(c("--assignment_abcregion_dexlps"), type="character", help="Path to rds file of assignment of abcregions to genes (in DexLPS condition)"), make_option(c( "--assignment_abcregion_lps"), type="character", help="Path to rds file of assignment of abcregions to genes (in LPS condition)"), make_option(c("--motifcounts_summitregion"), type="character", help="Path to rds file of fimo motifcounts within summitregions"), make_option(c("--motifcounts_abcregion_dexlps"), type="character", help="Path to rds file of fimo motifcounts within ABC regions (in DexLPS condition)"), make_option(c("--motifcounts_abcregion_lps"), type="character", help="Path to rds file of fimo motifcounts within ABC regions (in LPS condition)"), make_option(c( "--outdir"), type="character", help="Path to output directory") ) opt <- parse_args(OptionParser(option_list=option_list)) dir.create(opt$outdir) #change default for stringAsFactors options(stringsAsFactors = FALSE) suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(plyranges, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T)) #set defaults for ggplot2 figures theme_update(panel.background = element_rect(fill = "transparent", colour = NA), plot.background = element_rect(fill = "transparent", colour = NA), legend.background = element_rect(fill = "transparent", colour = NA), legend.key = element_rect(fill = "transparent", colour = NA), text=element_text(size=6, family = "ArialMT", colour="black"), title=element_text(size=8, family="ArialMT", colour="black"), panel.grid.major = element_line(colour="grey", size=0.2), panel.grid.minor = element_blank(), axis.text = element_text(size=6, family="ArialMT", colour="black"), axis.line = element_line(colour="black"), axis.ticks = element_line(colour="black"), legend.key.size = unit(6, 'points'), #change legend key size legend.key.height = unit(6, 'points'), #change legend key height legend.key.width = unit(6, 'points'), #change legend key width legend.text = element_text(size=6, family="ArialMT", colour="black")) set.seed(12345) #------------------------------- ## read in data #------------------------------- contrast_DexLPSvLPS <- read.delim(opt$contrast_DexLPSvLPS) for (optname in names(opt)[2:9]){ #except for the outdir print(paste0("Loading ", optname)) assign(optname, readRDS( opt[[optname]] )) } #-------------------------------------------- ## ---- function definitions #-------------------------------------------- coerce_coef2df <- function(model_coef){ model_coef <- as.matrix(model_coef) model_coef <- as.data.frame(model_coef) model_coef$names <- rownames(model_coef) colnames(model_coef) <- c("estimates", "names") return(model_coef) } #------------function will------------: # * merge the motifdata with gene assignments and # * then merge the gene expression changes, # * filter for DE genes and aggregate per gene #-------------------------------------- merge_motifdata_with_assignments <- function( motifcounts, assignments, contrast, maxonly=FALSE, excludepromoters=FALSE, weightby=FALSE, sepPromEnh=FALSE){ # check if we should use all abc assignments, or only the max one of each peakID if(maxonly==TRUE){ assignments <- assignments %>% group_by(name) %>% filter(abcscore==max(abcscore)) %>% distinct() } else{ assignments <- assignments } if(excludepromoters=="all"){ assignments <- assignments %>% filter(!class=="promoter") } else if (excludepromoters=="onlyNONself"){ assignments <- assignments %>% filter(!c(class=="promoter" & isSelfPromoter=="False")) } else{ assignments <- assignments } motifdf <- merge(motifcounts, assignments, by.x="name", by.y="name") motifdf <- motifdf %>% relocate(c(anno)) # merge the expression change # use mgi_symbol for prox based, otherwise ensemblID # We DONT set all.x=TRUE because we don't care about predicting gene that aren't even expressed or that we don't have a clear label for if(maxonly=="prox"){ motifdf <- merge(motifdf, contrast, by.x="anno", by.y="mgi_symbol") } else { motifdf <- merge(motifdf, contrast, by.x="anno", by.y="Row.names") } #recode the logFC and padj into a label (optionally through command line arguments) motifdf <- motifdf %>% mutate(label=case_when(log2FoldChange>0.58 & padj < 0.05 ~ "up", log2FoldChange<(-0.58) & padj < 0.05 ~ "down", TRUE ~ "no_change")) %>% filter(label!="no_change") %>% mutate(label=factor(label, levels=c("down","up"), labels=c(0,1))) %>% relocate(label) # chr 2, 3 and 4 (20%) were used as the tuning set for hyperparameter tuning. # Regions from chromosomes 1, 8 and 9 (20%) were used as the test set for performance evaluation # The remaining regions were used for model training. #------aggregate over gene SYMBOL if ("abcscore" %in% colnames(motifdf)) { # loop for ABC based assignments if (weightby=="abcscore"){ unselect_col <- "abcnumerator" } else if(weightby=="abcnumerator") { unselect_col <- "abcscore" } else{ unselect_col <- c("abcscore","abcnumerator") } if (sepPromEnh==TRUE){ motifdf_aggr <- motifdf %>% dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>% { if(weightby!=FALSE) mutate(.,across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways group_by(label,seqnames,anno,class) %>% dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts ungroup() # From here we need to cast the motifcounts for the promoterregions, so that in the end we have one row per gene (instead of 1-2) motifdf_aggr <- motifdf_aggr %>% tidyr::pivot_wider(id_cols=c(label,seqnames,anno), names_from=class, values_from = !c(label,seqnames,anno, class), values_fill = 0) } else { motifdf_aggr <- motifdf %>% dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>% { if(weightby!=FALSE) mutate(., across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways group_by(label,seqnames,anno) %>% dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts ungroup() } } else { # loop for prox based assignments motifdf_aggr <- motifdf %>% dplyr::select(!c("name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype")) %>% group_by(label,seqnames,anno) %>% dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts ungroup() } return(motifdf_aggr) } #------------function needs------------: # featurematrix and indexes to perform split of test vs trainval set # (trainval set is used for CV to find optimal regularization) #------------function will------------: # run cv.glmnet (as elastic net regression) and pick lambda.1se as regularization # determine model performance on test set # plots for raw counts of certain factors are currently turned of complete_GLM_analysis <- function(motifdf, trainvalidx, genenames){ # Create training subset for model development & testing set for model performance testing # make it based on chromosomes, instead of completely random # as in bpnet, we set aside chr 1,8 and 9 for testing #inTrain <- sort(sample(nrow(motifdata_abc_aggr), nrow(motifdata_abc_aggr)*0.75)) features_train <- motifdf[ trainvalidx, -c(1,2,3)] %>% as.matrix() features_test <- motifdf[ -trainvalidx, -c(1,2,3)] %>% as.matrix() targets_train <- motifdf[ trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix() targets_test <- motifdf[ -trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix() #------------------------ ## Elastic net #------------------------ cvfit_net <- glmnet::cv.glmnet(x=features_train, y=targets_train, family="binomial", type.measure = "auc", nfolds = 6, alpha=0.5) # Performance on training data targets_train_net.prob <- predict(cvfit_net, type="response", newx = features_train, s = 'lambda.min') pred_train_net <- ROCR::prediction(targets_train_net.prob[,1], targets_train) #only need the probabilities for 1's auc_ROCR_train_net <- ROCR::performance(pred_train_net, measure = "auc") #to assess AUC for model auc_ROCR_train_net@y.values[[1]] # Predict on unseen data targets_net.prob <- predict(cvfit_net, type="response", newx = features_test, s = 'lambda.min') pred_net <- ROCR::prediction(targets_net.prob[,1], targets_test) #only need the probabilities for 1's auc_ROCR_net <- ROCR::performance(pred_net, measure = "auc") #to assess AUC for model auc_ROCR_net@y.values[[1]] #------------------------ ## Model coefficients #------------------------ coef_df_net <- coerce_coef2df( coefficients(cvfit_net, s = 'lambda.min')) gg_net <- ggplot( data=coef_df_net %>% filter(abs(estimates)>0) )+ geom_bar(aes(x=reorder(names, -estimates), y=estimates), stat="identity")+ labs(title=" ",x="", y="Net coefs")+ #coord_flip()+ theme(axis.text.x = element_text(angle = 45, hjust=1), plot.margin = unit(c(0, 0, -6, 10), "points")) #------------------------ # ROC curves #------------------------ # calculate probabilities for TPR/FPR for predictions perf_train_net <- ROCR::performance(pred_train_net,"tpr","fpr") perf_net <- ROCR::performance(pred_net,"tpr","fpr") # plot ROC curve gg_ROC <- ggplot()+ geom_line( aes(x=perf_train_net@x.values[[1]], y=perf_train_net@y.values[[1]], colour = "train_net") ) + geom_line( aes(x=perf_net@x.values[[1]],y=perf_net@y.values[[1]], colour = "test_net") ) + geom_abline(intercept=0,slope=1, linetype=4, colour="grey")+ scale_colour_manual(values=c("darkblue", "blue"), name=" ", breaks=c("train_net", "test_net"), labels=c(paste("net train. AUC:",round( auc_ROCR_train_net@y.values[[1]], 2)) , paste("net test. AUC:", round( auc_ROCR_net@y.values[[1]], 2)) ) )+ labs(x="False positive rate", y="True positive rate")+ theme( legend.position=c(0.65,0.15) ) # add metric to global variable new_metrics = data.frame( net_train = auc_ROCR_train_net@y.values[[1]], net_test = auc_ROCR_net@y.values[[1]] ) full_panel <- ggpubr::ggarrange(gg_ROC,gg_net, nrow=2, labels=c("A","B"), heights = c(1,1)) results <- list() results[[1]] <- full_panel results[[2]] <- new_metrics results[[3]] <- coef_df_net results[[4]] <- cvfit_net return(results) } #------------------------------------------------ ## initialize object to track performance metrics #------------------------------------------------ AUC_metrics = data.frame( # parameter combination condition=character(), motifdata=character(), excludepromoters=character(), onlymax=character(), weight=character(), sepPromEnh=character(), # performance net_train = numeric(), net_test = numeric() ) net_model_coefs = list() #------------------------------------------------ ## run proximity based #------------------------------------------------ # This is independent of the ABC results (no need to loop through different assignment variations) print("Running GLM on prox-based assignments") my_rdsfile <- paste0(opt$outdir,"prox.rds") if(!file.exists(my_rdsfile)){ motifdata_aggr <- merge_motifdata_with_assignments(motifcounts_summitregion, assignment_summit_prox, contrast_DexLPSvLPS, maxonly="prox", excludepromoters=FALSE, weightby = FALSE, sepPromEnh = FALSE) motifdata_aggr_scaled <- motifdata_aggr %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) motifdata_aggr_tranval_idx <- motifdata_aggr_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) performance <- complete_GLM_analysis (motifdata_aggr_scaled, motifdata_aggr_tranval_idx, genenames=genenames) #------------------------ ## Raw counts #------------------------ nr3c1counts <- motifdata_aggr %>% group_by(label) %>% dplyr::count(NR3C1) %>% mutate(counts_fac = case_when(NR3C1>=1 ~ ">=1", NR3C1==0 ~ "0"), # aggregate it towards to top end counts_fac = factor(as.character(counts_fac), levels=c("0",">=1")) ) %>% group_by(counts_fac) %>% mutate(freq = n / sum(n)) %>% group_by(label,counts_fac) %>% dplyr::summarize(sum_n=sum(n), sum_freq=sum(freq)) gg_nr3c1counts <- ggplot(data=nr3c1counts , aes(x=counts_fac,y=sum_freq,fill=label)) + geom_bar(stat="identity", position="dodge", alpha=0.5)+ geom_text(aes(label=format(sum_freq, digits=2)), position = position_dodge(.9), vjust = -0.5, size = 3) + scale_fill_manual(values=c("0"="blue", "1"="red"), labels=c("DOWN","UP"))+ labs(x="NR3C1 matches", y="% genes", fill="")+ theme( plot.margin = unit(c(10, 6, 0, 10), "points") ) gg_nr3c1counts relcounts <- motifdata_aggr %>% group_by(label) %>% dplyr::count(REL) %>% mutate(counts_fac = case_when(REL>=1 ~ ">=1", REL==0 ~ "0"), # aggregate it towards to top end counts_fac = factor(as.character(counts_fac), levels=c("0",">=1")) ) %>% group_by(counts_fac) %>% mutate(freq = n / sum(n)) %>% group_by(label,counts_fac) %>% dplyr::summarize(sum_n=sum(n), sum_freq=sum(freq)) gg_relcounts <- ggplot(data=relcounts , aes(x=counts_fac,y=sum_freq,fill=label)) + geom_bar(stat="identity", position="dodge", alpha=0.5)+ geom_text(aes(label=format(sum_freq, digits=2)), position = position_dodge(.9), vjust = -0.5, size = 3) + scale_fill_manual(values=c("0"="blue", "1"="red"), labels=c("DOWN","UP"))+ labs(x="REL matches", y="% genes", fill="")+ theme( plot.margin = unit(c(10, 6, 0, 10), "points") ) gg_relcounts gg_rawcounts <- ggpubr::ggarrange( gg_nr3c1counts, gg_relcounts, ncol=2, common.legend = TRUE, legend="bottom") saveRDS(gg_rawcounts, paste0(opt$outdir,"gg_rawcounts.rds")) #------------------------ saveRDS(performance, file=my_rdsfile) } else { performance <- readRDS(my_rdsfile) } plot(performance[[1]]) AUC_metrics <- rbind(AUC_metrics, "prox" = c( condition="dexlps", motifdata="motifcounts_summitregion", excludepromoters=FALSE, onlymax="prox", weight=FALSE, sepPromEnh=FALSE, performance[[2]] ) ) net_model_coefs[["prox"]] <- performance[[3]] ## Raw counts gg_firstgene <- ggplot() + geom_bar(data=motifdata_aggr , aes(NR3C1,fill=label), position="dodge", alpha=0.5)+ scale_fill_manual(values=c("0"="blue", "1"="red"), labels=c("DOWN","UP"))+ labs(x="NR3C1", y="counts", fill="")+ theme( plot.margin = unit(c(10, 6, 0, 10), "points") ) gg_firstgene #------------------------------------------------ ## run example #------------------------------------------------ # motifdata_aggr <- merge_motifdata_with_assignments( # motifcounts_summitregion, # assignment_summitregion_abc_dexlps, # contrast_DexLPSvLPS, # maxonly=FALSE, # excludepromoters=FALSE, # weightby=FALSE, # sepPromEnh=FALSE) # # motifdata_aggr_tranval_idx <- motifdata_aggr %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) # performance <- complete_GLM_analysis (motifdata_aggr, motifdata_aggr_tranval_idx, genenames=genenames) # plot(performance[[1]]) # AUC_metrics <- rbind(AUC_metrics, # "test" = c(condition="dexlps", # motifdata="motifcounts_peak", # onlymax=FALSE, # excludepromoters=FALSE, # weight=FALSE, # sepPromEnh=FALSE, # performance[[2]])) # net_model_coefs[["test"]] <- performance[[3]] # # gg_firstgene <- ggplot() + # geom_bar(data=performance[[5]] , aes(NR3C1,fill=label), position="dodge", alpha=0.5)+ # scale_fill_manual(values=c("0"="blue", "1"="red"), # labels=c("DOWN","UP"))+ # labs(x="NR3C1", y="counts", fill="")+ # theme( # plot.margin = unit(c(10, 6, 0, 10), "points") # ) # gg_firstgene #------------------------------------------------ ## peakregion_counts - proxbased assignment ## peakregion_counts - abcbased assignment (both conditions) ## enhancerregion_counts - abcbased assignment (2 conditions) #------------------------------------------------ # rewrite to assign values for both conditions and then use those to compute the difference as well not_all_na <- function(x) any(!is.na(x)) for (motifdata in c("motifcounts_abcregion","motifcounts_summitregion")){ for(excludepromoters in c(FALSE,"all","onlyNONself")){ for (onlymax in c(TRUE,FALSE)){ for (sepPromEnh in c(TRUE,FALSE)){ for (weight in c(FALSE, "abcscore")){ # set assignments fitting for the input data if(motifdata=="motifcounts_abcregion"){ assignment_dexlps <- assignment_abcregion_dexlps assignment_lps <- assignment_abcregion_lps motifcounts_dexlps <- motifcounts_abcregion_dexlps motifcounts_lps <- motifcounts_abcregion_lps } else if (motifdata=="motifcounts_summitregion" ){ # in this case the motifcounts for the 2 conditions are the same, but there assignments differ assignment_dexlps <- assignment_summits_abcregion_dexlps assignment_lps <- assignment_summits_abcregion_lps motifcounts_dexlps <- motifcounts_summitregion motifcounts_lps <- motifcounts_summitregion } else {break} #-----------------------DEXLPS------------------------------ featurematrix_dexlps <- merge_motifdata_with_assignments(motifcounts_dexlps, assignment_dexlps, contrast_DexLPSvLPS, weightby=weight, maxonly=onlymax, excludepromoters=excludepromoters, sepPromEnh=sepPromEnh) # remove columns that are NA after scaling featurematrix_dexlps_scaled <- featurematrix_dexlps %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>% dplyr::select(where(not_all_na)) #----- motifdata_aggr_tranval_idx <- featurematrix_dexlps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) modelname <- paste(motifdata,"condition_dexlps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_") print(modelname) my_rdsfile <- here( paste0(opt$outdir, modelname,".rds")) if(!file.exists(my_rdsfile)){ performance <- complete_GLM_analysis (featurematrix_dexlps_scaled, motifdata_aggr_tranval_idx, genenames=genenames) saveRDS(performance,file=my_rdsfile) } else { performance <- readRDS(my_rdsfile) } AUC_metrics <- rbind(AUC_metrics, c( condition="dexlps", motifdata=motifdata, excludepromoters=excludepromoters, onlymax=onlymax, weight=weight, sepPromEnh=sepPromEnh, performance[[2]] ) )%>% magrittr::set_rownames(c(rownames(AUC_metrics),modelname)) net_model_coefs[[modelname]] <- performance[[3]] #--------------------------LPS--------------------------- featurematrix_lps <- merge_motifdata_with_assignments(motifcounts_lps, assignment_lps, contrast_DexLPSvLPS, weightby=weight, maxonly=onlymax, excludepromoters=excludepromoters, sepPromEnh=sepPromEnh) featurematrix_lps_scaled <- featurematrix_lps %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>% dplyr::select(where(not_all_na)) #----- motifdata_aggr_tranval_idx <- featurematrix_lps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) modelname <- paste(motifdata,"condition_lps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_") print(modelname) my_rdsfile <- here(paste0(opt$outdir, modelname,".rds")) if(!file.exists(my_rdsfile)){ performance <- complete_GLM_analysis (featurematrix_lps_scaled, motifdata_aggr_tranval_idx, genenames=genenames) saveRDS(performance,file=my_rdsfile) } else { performance <- readRDS(my_rdsfile) } AUC_metrics <- rbind(AUC_metrics, c( condition="lps", motifdata=motifdata, excludepromoters=excludepromoters, onlymax=onlymax, weight=weight, sepPromEnh=sepPromEnh, performance[[2]] ) )%>% magrittr::set_rownames(c(rownames(AUC_metrics),modelname)) net_model_coefs[[modelname]] <- performance[[3]] #-----------------------DIFFERENCE------------------------------ # they contain the same motifs, but not the same genes. # doublecheck that all columnnames are identical table(colnames(featurematrix_dexlps) == colnames(featurematrix_lps)) # motifcounts missing in one condition should be set to 0 merged_featurematrix <- merge(featurematrix_dexlps, featurematrix_lps, by=c("anno","label","seqnames"), all=TRUE) # replace missing counts in one of the conditions with 0 merged_featurematrix[is.na(merged_featurematrix)] <- 0 # use dataframe suffix to grab respective columns featurematrix_diff <- merged_featurematrix[grep(".x$",colnames(merged_featurematrix))] - merged_featurematrix[grep(".y$",colnames(merged_featurematrix))] # tidy up column names colnames(featurematrix_diff) <- gsub(".x$","", colnames(featurematrix_diff)) # add first 3 columns back after computing the difference of the counts featurematrix_diff <- cbind(merged_featurematrix[1:3],featurematrix_diff) featurematrix_diff_scaled <- featurematrix_diff %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>% dplyr::select(where(not_all_na)) # run GLM and look at performance featurematrix_diff_tranval_idx <- featurematrix_diff_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) modelname <- paste(motifdata,"condition_DexLPS-LPS_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_") print(modelname) my_rdsfile <- here(paste0(opt$outdir, modelname,".rds")) if(!file.exists(my_rdsfile)){ performance <- complete_GLM_analysis (featurematrix_diff_scaled, featurematrix_diff_tranval_idx, genenames=genenames) saveRDS(performance,file=my_rdsfile) } else { performance <- readRDS(my_rdsfile) } AUC_metrics <- rbind(AUC_metrics, c( condition="DexLPS-LPS", motifdata=motifdata, excludepromoters=excludepromoters, onlymax=onlymax, weight=weight, sepPromEnh=sepPromEnh, performance[[2]] ) )%>% magrittr::set_rownames(c(rownames(AUC_metrics),modelname)) net_model_coefs[[modelname]] <- performance[[3]] } } } } } #------------------------------------------------ ## put model coefs into dataframes #------------------------------------------------ motifnames <- colnames(motifcounts_summitregion)[3:ncol(motifcounts_summitregion)] # make empty dataframe with all motifs model_coefs_joint <- data.frame(featurename=motifnames) # when we treat promoters and enhancers separately, the featurenames differ model_coefs_sep <- data.frame(featurename= c(paste(motifnames,"genic",sep="_"), paste(motifnames,"promoter",sep="_"), paste(motifnames,"intergenic",sep="_")) ) for (model in names(net_model_coefs)){ if (grepl("sepPromEnh_FALSE",model)){ model_coefs_joint <- merge(model_coefs_joint, net_model_coefs[[model]], by.x="featurename", by.y="names", all=TRUE) %>% dplyr::rename(!! model := "estimates") } else if (model=="prox"){ model_coefs_joint <- merge(model_coefs_joint, net_model_coefs[[model]], by.x="featurename", by.y="names", all=TRUE) %>% dplyr::rename(!! model := "estimates") } else if (grepl("sepPromEnh_TRUE",model)){ model_coefs_sep <- merge(model_coefs_sep, net_model_coefs[[model]], by.x="featurename", by.y="names", all=TRUE) %>% dplyr::rename(!! model := "estimates") } else { stop("The modelnames don't match the expected pattern.") } } #------------------------------------------------ ## save data for plotting #------------------------------------------------ saveRDS (model_coefs_joint, paste0(opt$outdir, "model_coefs_joint.rds")) saveRDS (model_coefs_sep, paste0(opt$outdir, "model_coefs_sep.rds")) saveRDS (AUC_metrics, paste0(opt$outdir, "AUC_metrics.rds")) #------------------------------------------------ sessionInfo() |
R
ggplot2
dplyr
org.Mm.eg.db
optparse
here
ComplexHeatmap
FiMO
plyranges
From
line
1
of
scripts/abc_predictions.r
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 | suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T)) option_list <- list( make_option(c( "--ABC_DexLPS_all"), type="character", help="Path to file with all ABC predictions passing 0.02 (in DexLPS condition)"), make_option(c("--ABC_LPS_all"), type="character", help="Path to file with all ABC predictions passing 0.02 (in LPS condition)"), make_option(c("--contrast_DexVSDexLPS"), type="character", help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"), make_option(c("--chipseq_summits"), type="character", help="Path to summit file of IDR peaks"), make_option(c("--igv"), type="character", help="Path to IGV snapshot of genomic locus (created manually)") ) opt <- parse_args(OptionParser(option_list=option_list)) options(stringsAsFactors = FALSE) suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(topGO, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ggExtra, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(plyr, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T)) #set defaults for ggplot2 figures theme_update(panel.background = element_rect(fill = "transparent", colour = NA), plot.background = element_rect(fill = "transparent", colour = NA), legend.background = element_rect(fill = "transparent", colour = NA), legend.key = element_rect(fill = "transparent", colour = NA), text=element_text(size=8, family = "ArialMT", colour="black"), title=element_text(size=10, family="ArialMT", colour="black"), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text = element_text(size=8, family="ArialMT", colour="black"), axis.line = element_line(colour="black"), axis.ticks = element_line(colour="black")) #---------------------------------------------------------------------- ##--NOTE: example code for additional plots can be found in abc_visualizations.Rmd #---------------------------------------------------------------------- #---------------------------------------------------------------------- ##----------- load data #---------------------------------------------------------------------- chipseq_summits <- rtracklayer::import.bed(opt$chipseq_summits) contrast_DexLPSvLPS <- read.delim(opt$contrast_DexVSDexLPS) # rename first column for merge later colnames(contrast_DexLPSvLPS)[1] <- "EnsemblID" ABC_DexLPS_all <- read.delim(opt$ABC_DexLPS_all) ABC_LPS_all <- read.delim(opt$ABC_LPS_all) # remove version number from ensembl IDs ABC_DexLPS_all$TargetGene = gsub("\\.[0-9]*$","",ABC_DexLPS_all$TargetGene) ABC_LPS_all$TargetGene = gsub("\\.[0-9]*$","",ABC_LPS_all$TargetGene) #---------------------------------------------------------------------- ##----------- data exploration #---------------------------------------------------------------------- # What do the distributions of ABC scores look like ggplot()+ geom_density(data=ABC_DexLPS_all %>% filter(!class=="promoter"), aes(x=ABC.Score, colour="DexLPS"))+ geom_density(data=ABC_LPS_all %>% filter(!class=="promoter"), aes(x=ABC.Score, colour="LPS"))+ scale_colour_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) # How do the ABC scores of promoters compare to those of enhancers? ggplot()+ geom_density(data=ABC_DexLPS_all %>% filter(class=="promoter"), aes(x=ABC.Score, colour="promoter"))+ geom_density(data=ABC_DexLPS_all %>% filter(class=="genic"), aes(x=ABC.Score, colour="genic"))+ geom_density(data=ABC_DexLPS_all %>% filter(class=="intergenic"), aes(x=ABC.Score, colour="intergenic"))+ scale_colour_manual("", breaks = c("promoter", "genic", "intergenic"), values = c("red", "green", "blue")) # What distance is used to define sth as promoter? ggplot()+ geom_density(data=ABC_DexLPS_all, aes(x=distance, colour=class))+ scale_x_continuous(trans = "log10")+ scale_colour_manual("", breaks = c("promoter", "genic", "intergenic"), values = c("red", "green", "blue")) ggplot()+ geom_density(data=ABC_DexLPS_all %>% filter(class=="promoter"), aes(x=distance, colour=isSelfPromoter))+ scale_x_continuous(trans = "log10")+ scale_colour_manual("isSelfPromoter", breaks = c("True", "False"), values = c("red", "blue"))+ labs(title="All promoter regions, split by whether they're promoters for their target gene")+ theme( plot.title = element_text(size=12) ) #---------------------------------------------------------------------- # ------ ------ Number of enhancers per gene #---------------------------------------------------------------------- # Are most genes regulated by a single enhancer region or by multiple? How many does the ABC model assign per gene? plot_enhancers_per_gene <- function(){ DexLPS_TargetGeneCounts <- plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"), vars="TargetGene") LPS_TargetGeneCounts <- plyr::count(ABC_LPS_all %>% filter(!class=="promoter"), vars="TargetGene") gg <- ggplot()+ geom_histogram(data=DexLPS_TargetGeneCounts, aes(x=freq, fill="DexLPS", colour="DexLPS"), binwidth = 1, alpha=0.2)+ geom_histogram(data=LPS_TargetGeneCounts, aes(x=freq, fill="LPS", colour="LPS"), binwidth = 1, alpha=0.2)+ scale_fill_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) + scale_colour_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) + guides(color = "none")+ theme(legend.position = c(0.8,0.8))+ labs(title=" ", x="# of enhancers per gene", y="counts") return(gg) } gg_enh_per_gene <- plot_enhancers_per_gene() gg_enh_per_gene print("We find an average of") plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"), vars="TargetGene") %>% pull(freq) %>% mean() print("enhancers per gene for the DexLPS condition and") plyr::count(ABC_LPS_all %>% filter(!class=="promoter"), vars="TargetGene") %>% pull(freq) %>% mean() print("in LPS.") #---------------------------------------------------------------------- # ------ ------ Number of genes per enhancer #---------------------------------------------------------------------- # ABC allows for multiple assignments (an individual enhancer assigned to more than one promoter) plot_genes_per_enhancer <- function(){ DexLPS_EnhancerCounts <- plyr::count(ABC_DexLPS_all %>% filter(!class=="promoter"),vars="name") LPS_EnhancerCounts <- plyr::count(ABC_LPS_all %>% filter(!class=="promoter"),vars="name") gg_full <- ggplot()+ geom_histogram(data=DexLPS_EnhancerCounts, aes(x=freq, fill="DexLPS",colour="DexLPS"), binwidth = 1, alpha=0.2)+ geom_histogram(data=LPS_EnhancerCounts, aes(x=freq, fill="LPS",colour="LPS"), binwidth = 1, alpha=0.2)+ scale_fill_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) + scale_colour_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) + guides(color = FALSE)+ labs(title="Number of genes an individual enhancer is assigned to", x="# of genes per enhancer", y="counts") plot(gg_full) gg_zoom <- ggplot()+ geom_histogram(data=DexLPS_EnhancerCounts, aes(x=freq, fill="DexLPS",colour="DexLPS"), binwidth = 1, alpha=0.2)+ geom_histogram(data=LPS_EnhancerCounts, aes(x=freq, fill="LPS",colour="LPS"), binwidth = 1, alpha=0.2)+ scale_fill_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) + scale_colour_manual("", breaks = c("DexLPS", "LPS"), values = c("#339966", "#0066CC")) + xlim(c(0,8))+ guides(color = FALSE)+ theme(legend.position = c(0.8,0.8))+ labs(title=" ", x="# of genes per enhancer", y="counts") plot(gg_zoom) print("We find an average of") DexLPS_EnhancerCounts %>% pull(freq) %>% mean() %>% print() print("genes per enhancer in the DexLPS condition and") LPS_EnhancerCounts %>% pull(freq) %>% mean() %>% print() print("in LPS.") return(gg_zoom) } gg_genes_per_enhancer_zoom <- plot_genes_per_enhancer() #---------------------------------------------------------------------- # ------ Comparing ABC Scores (including promoter regions) between the conditions #---------------------------------------------------------------------- # merge regions from both conditions but also keep those that are only in one # # add info no whether the target genes are DE or now and what direction they're changed LPS_GR<- plyranges::as_granges(ABC_LPS_all, seqnames=chr) DexLPS_GR <- plyranges::as_granges(ABC_DexLPS_all, seqnames=chr) #---------------------------------------------------------------------- merge_LPSandDexLPS_ABCdata <- function(ABC_LPS_all_GR, ABC_DexLPS_all_GR){ ABC_LPS_all_unique <- unique(ABC_LPS_all_GR) ABC_DexLPS_all_unique <- unique(ABC_DexLPS_all_GR) # and then figure out which ones overlap with one another overlap_ABC_all_unique <- ChIPpeakAnno::findOverlapsOfPeaks (ABC_LPS_all_unique, ABC_DexLPS_all_unique, connectedPeaks = "keepAll", ignore.strand = TRUE) # we coerce the info of overlapping regions into a dataframe overlap_ABC_all_unique_df <- as.data.frame(overlap_ABC_all_unique$overlappingPeaks) # and clean up column names colnames(overlap_ABC_all_unique_df)<- gsub("ABC_LPS_all_unique...ABC_DexLPS_all_unique.","", colnames(overlap_ABC_all_unique_df)) colnames(overlap_ABC_all_unique_df)<- gsub(".1", "", colnames(overlap_ABC_all_unique_df)) colnames(overlap_ABC_all_unique_df)<- paste( c(rep("LPS_",27),rep("DexLPS_",27),rep("",2)), colnames(overlap_ABC_all_unique_df), sep="") # we use this overlap to define a new variable that holds the info of which regions of the conditions form a pair overlap_ABC_all_unique_df$pairID <- paste("pair", seq(1, nrow(overlap_ABC_all_unique_df)), sep="" ) #-----------------for DexLPS------------------- # merge the pairID onto the original dataframes ABC_DexLPS_all_wpairID <- merge(ABC_DexLPS_all, overlap_ABC_all_unique_df[,c("DexLPS_name","pairID")], by.x="name", by.y="DexLPS_name", all.x=TRUE) # for those that don't correspond to a pair, we assign condition and name as pairID ABC_DexLPS_all_wpairID <- ABC_DexLPS_all_wpairID %>% mutate(pairID = case_when(is.na(pairID) ~ paste("DexLPS",name, sep="_"), TRUE ~ pairID) ) # combination of pairID and gene they're assigned to forms the assignment variable. # we will use this later to plot values from the 2 conditions, that have the same assignment ABC_DexLPS_all_wpairID$assignment <- paste(ABC_DexLPS_all_wpairID$pairID, ABC_DexLPS_all_wpairID$TargetGene, sep="_") #-----------------for LPS------------------- # merge the pairID onto the original dataframes ABC_LPS_all_wpairID <- merge(ABC_LPS_all, overlap_ABC_all_unique_df[,c("LPS_name","pairID")], by.x="name", by.y="LPS_name", all.x=TRUE) # for those that don't correspond to a pair, we assign condition and name as pairID ABC_LPS_all_wpairID <- ABC_LPS_all_wpairID %>% mutate(pairID = case_when(is.na(pairID) ~ paste("LPS",name, sep="_"), TRUE ~ pairID) ) ABC_LPS_all_wpairID$assignment <- paste(ABC_LPS_all_wpairID$pairID, ABC_LPS_all_wpairID$TargetGene, sep="_") merged_conditions <- merge(ABC_DexLPS_all_wpairID, ABC_LPS_all_wpairID, by="assignment", all=TRUE, suffixes = c(".DexLPS", ".LPS")) merged_conditions <- merged_conditions %>% mutate(TargetGene = case_when(TargetGene.DexLPS==TargetGene.LPS ~ TargetGene.DexLPS, is.na(TargetGene.DexLPS) ~ TargetGene.LPS, is.na(TargetGene.LPS) ~ TargetGene.DexLPS, TRUE ~ NA_character_) ) return(merged_conditions) } merged_conditions_ABC <- merge_LPSandDexLPS_ABCdata(LPS_GR,DexLPS_GR) #---------------------------------------------------------------------- # ------ merge on gene expression results #---------------------------------------------------------------------- # merge the RNAseq results to use as colour later merged_conditions_ABC <- merge(merged_conditions_ABC, contrast_DexLPSvLPS, by.x="TargetGene", by.y="EnsemblID") merged_conditions_ABC <- merged_conditions_ABC %>% mutate(change=case_when(padj<0.05 & log2FoldChange>0.58 ~ "up", padj<0.05 & log2FoldChange<(-0.58) ~ "down", TRUE ~ "no change")) # replace the na with 0 merged_conditions_ABCnomissing <- merged_conditions_ABC %>% mutate(ABC.Score.DexLPS = case_when(is.na(ABC.Score.DexLPS) ~ 0, TRUE ~ ABC.Score.DexLPS) ) %>% mutate(ABC.Score.LPS = case_when(is.na(ABC.Score.LPS) ~ 0, TRUE ~ ABC.Score.LPS) ) cor_ABCscores <- cor(merged_conditions_ABCnomissing$ABC.Score.DexLPS, merged_conditions_ABCnomissing$ABC.Score.LPS) gg_merged_conditions_ABCnomissing <- ggplot(merged_conditions_ABCnomissing)+ geom_point(aes(x=ABC.Score.DexLPS, y=ABC.Score.LPS, fill=log2FoldChange), alpha=0.2, size=1, stroke = 0, shape=21)+ scale_fill_gradient(low="blue", high="red")+ ylim(c(0,0.8))+ xlim(c(0,0.8))+ geom_text(x=0.2, y=0.75, label=paste("r = ",format(cor_ABCscores, digits = 2)))+ labs(fill="log2FC", x="ABC score Dex+LPS", y="ABC score LPS")+ theme(legend.position = "top") gg_merged_conditions_ABCnomissing #---------------------------------------------------------------------- # ------ plot foldchange in ABC score vs foldchange in expression #---------------------------------------------------------------------- cor_ABCdiff_log2FC <- with(merged_conditions_ABCnomissing %>% filter(change!="no change"), cor((ABC.Score.DexLPS - ABC.Score.LPS),log2FoldChange)) gg_ABCdiff_log2FC <- ggplot(merged_conditions_ABCnomissing %>% filter(change!="no change"), aes(x=(ABC.Score.DexLPS - ABC.Score.LPS), y=log2FoldChange)) + geom_hex(bins=70)+ viridis::scale_fill_viridis()+ geom_smooth(method = "lm", colour="black")+ geom_text(x=0.6, y=5, label=paste("r = ",format(cor_ABCdiff_log2FC, digits = 2)))+ labs(x="ABC score Dex+LPS - ABC score LPS", y="expression log2FC(Dex+LPS / LPS)")+ theme(legend.position = "top") #---------------------------------------------------------------------- # ------ ABC score by peak presence #---------------------------------------------------------------------- plot_ABC_with_marginals <- function(whatchange){ gg <- ggplot(data= merged_conditions_ABCnomissing %>% filter(change==whatchange) , aes(x=ABC.Score.DexLPS, y=ABC.Score.LPS ) )+ geom_point( aes( fill=whatchange) , size=1, alpha=1,stroke = 0, shape=21) + geom_abline(slope = 1)+ ylim(c(0,0.8))+ xlim(c(0,0.8))+ labs(x="ABC score Dex+LPS", y="ABC score LPS")+ scale_fill_manual( breaks=c("up","no change","down"), values=c("red","black","blue") )+ theme(legend.position = "none") gg_marg <- ggExtra::ggMarginal(gg, type="histogram", size=2, xparams = list(bins=85), yparams = list(bins=85)) return(gg_marg) } ggplot_ABC_with_marginals_up <- plot_ABC_with_marginals("up") ggplot_ABC_with_marginals_up ggplot_ABC_with_marginals_down <-plot_ABC_with_marginals("down") ggplot_ABC_with_marginals_down #---------------------------------------------------------------------- # ------ show marginals as bar plot #---------------------------------------------------------------------- gg_marginals_dexlps <- ggplot()+ geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="up"), aes(x=ABC.Score.DexLPS, fill="up"), alpha=0.2, bins=85)+ geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="down"), aes(x=ABC.Score.DexLPS, fill="down"), alpha=0.2, bins=85)+ scale_fill_manual( breaks=c("up","down"), values=c("red","blue") )+ xlim(c(NA,0.3))+ labs(fill="gene expression change",x="ABC score DexLPS", y="ABC score LPS")+ theme(legend.position = c(0.7,0.7)) gg_marginals_lps <- ggplot()+ geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="up"), aes(x=ABC.Score.LPS, fill="up"), alpha=0.2, bins=85)+ geom_histogram(data=merged_conditions_ABCnomissing %>% filter(change=="down"), aes(x=ABC.Score.LPS, fill="down"), alpha=0.2, bins=85)+ scale_fill_manual( breaks=c("up","down"), values=c("red","blue") )+ xlim(c(NA,0.3))+ labs(fill="gene expression change",x="ABC score DexLPS", y="ABC score LPS")+ theme(legend.position = c(0.7,0.7)) #---------------------------------------------------------------------- # ------ ABC score by peak presence #---------------------------------------------------------------------- # Is there a difference in ABC score between the enhancers overlapping with a GR summit vs the ones that don't? DexLPS_GR_summits_overlap <- IRanges::findOverlaps(DexLPS_GR,chipseq_summits) #(query,subject) DexLPS_GR$haspeakID <- FALSE DexLPS_GR$haspeakID[queryHits(DexLPS_GR_summits_overlap)] <- TRUE plot_ABCscore_ofDEgenes_byGRoverlap <- function(){ DE_ENSEMBL <- contrast_DexLPSvLPS %>% filter(padj<0.05 & abs(log2FoldChange)>0.58 ) %>% pull(EnsemblID) DexLPS_enhancers_DEsubset <- as.data.frame(DexLPS_GR) %>% filter(TargetGene %in% DE_ENSEMBL) %>% filter(!class=="promoter") DexLPS_enhancers_DEsubset %>% filter(haspeakID==TRUE) %>% dplyr::pull(ABC.Score) %>% mean() %>% print() DexLPS_enhancers_DEsubset %>% filter(haspeakID==FALSE) %>% dplyr::pull(ABC.Score) %>% mean() %>% print() # show distribution for the two groups gg1 <- ggplot()+ geom_density(data=DexLPS_enhancers_DEsubset %>% filter(haspeakID==TRUE), aes(x=ABC.Score, colour="haspeakID", fill="haspeakID"), alpha=0.3)+ geom_density(data=DexLPS_enhancers_DEsubset %>% filter(haspeakID==FALSE), aes(x=ABC.Score, colour="nopeakID", fill="nopeakID"), alpha=0.3)+ scale_x_continuous(trans = "log2")+ scale_colour_manual("", labels = c("has GR peak", "no GR peak"), breaks = c("haspeakID", "nopeakID"), values = c("darkmagenta", "cadetblue"))+ scale_fill_manual("", labels = c("has GR peak", "no GR peak"), breaks = c("haspeakID", "nopeakID"), values = c("darkmagenta", "cadetblue"))+ guides(colour="none")+ #xlim(c(-3,0))+ labs(title=" ", x="ABC score Dex+LPS", y="density")+ theme( axis.text = element_text(size=12, family="ArialMT", colour="black"), axis.title = element_text(size=12, family="ArialMT", colour="black"), axis.text.x = element_text(angle = 45, vjust = 1, hjust=1), legend.position = c(0.8,0.8) ) plot(gg1) gg2 <- DexLPS_enhancers_DEsubset %>% ggplot( aes(x=hic_contact_pl_scaled_adj, y=log2(activity_base))) + geom_hex()+ facet_wrap(~haspeakID, labeller="label_both")+ viridis::scale_fill_viridis()+ labs(title = " ", x="Hi-C contact", y="log2(base activity)") plot(gg2) all_plots=list() all_plots[[1]] <- gg1 all_plots[[2]] <- gg2 return(all_plots) } gg_ABCscore_ofDEgenes_byGRoverlap <- plot_ABCscore_ofDEgenes_byGRoverlap() #---------------------------------------------------------------------- # ------ load IGV plot #---------------------------------------------------------------------- igv <- png::readPNG( opt$igv ) gg_igv <- ggplot() + ggpubr::background_image(igv) + # so it doesn't get squished #coord_fixed()+ # This ensures that the image leaves some space at the edges theme(plot.margin = margin(t=0, l=0, r=0, b=0, unit = "cm"), axis.line = element_blank()) #---------------------------------------------------------------------- # ------ put figure panel together #---------------------------------------------------------------------- # save giant files separately ggsave(here("./results/current/Figures/Figure_abcresults_3B.bmp"), gg_merged_conditions_ABCnomissing, width=75, height=100, units="mm", bg="white") ggsave(here("./results/current/Figures/Figure_abcresults_3B.png"), gg_merged_conditions_ABCnomissing, width=75, height=100, units="mm", bg="white") gg_placeholder <- ggplot() + theme(plot.margin = margin(t=0, l=0, r=0, b=0, unit = "cm"), axis.line = element_blank()) # first row gg_r1c1 <- ggpubr::ggarrange(gg_enh_per_gene, gg_genes_per_enhancer_zoom, labels = c("A", NA), ncol = 1, nrow = 2, heights = c(1,1)) gg_r1c2 <- ggpubr::ggarrange(gg_placeholder, ggplot_ABC_with_marginals_up, ggplot_ABC_with_marginals_down, labels = c("B", "C", NA), ncol = 3, nrow=1, widths=c(1,1,1)) gg_r1 <- ggpubr::ggarrange(gg_r1c1, gg_r1c2, labels = c(NA, NA), ncol = 2, nrow=1, widths=c(1,3)) # second row gg_r2 <- ggpubr::ggarrange(gg_ABCdiff_log2FC, gg_ABCscore_ofDEgenes_byGRoverlap[[1]], gg_ABCscore_ofDEgenes_byGRoverlap[[2]], labels = c("D","E", "F"), ncol = 3, nrow=1, widths=c(1,1,1.5)) gg_tophalf <- ggpubr::ggarrange(gg_r1, gg_r2, nrow=2, heights =c(1,1)) gg_tophalf full_panel <- ggpubr::ggarrange(gg_tophalf, gg_igv, labels=c(NA,"G"), nrow=2, heights = c(1.8,1)) #full_panel ggsave(here("./results/current/Figures/Figure_abcresults.png"), full_panel, width=300, height=300, units="mm", bg="white") ggsave(here("./results/current/Figures/Figure_abcresults.pdf"), full_panel, width=300, height=300, units="mm", bg="white") |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T)) option_list <- list( make_option(c("--ctss_pool1"), type="character", help="Path for ctss file from FANTOM5 of pool 1"), make_option(c("--ctss_pool2"), type="character", help="Path for ctss file from FANTOM5 of pool 2"), make_option(c("--liftoverchain"), type="character", help="Path to liftover chain file for mm9 to mm10"), make_option(c("--gencode_mm9_geneanno"), type="character", help="Path to genomic reference file for mm9"), make_option(c("--gencode_mm10_geneanno"), type="character", help="GENCODE genomic reference for assembly mm10, prefiltered for gene entries"), make_option(c("--outdir"), type="character", help="Output directory") ) opt <- parse_args(OptionParser(option_list=option_list)) suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm9, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ChIPseeker, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(CAGEr, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(rtracklayer, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(GenomicRanges, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, warn.conflicts=F, quietly=T)) outdir <- here(opt$outdir) # Workflow is based on the CAGEr vignette # https://www.bioconductor.org/packages/release/bioc/vignettes/CAGEr/inst/doc/CAGEexp.html #---------------------- Import CAGE samples from BMDMs #------------------------------------------------------------------------------------ # get URL for public samples, download and read necessariy columns into ctss file (see snakemake workflow) # We want to import bone-marrow derived macrophage samples through CAGEr. # After looking at list of available datasets we can decide what samples best fit our needs. # Let's see what samples are available through FANTOM5 #data(FANTOM5mouseSamples) #head(FANTOM5mouseSamples) # The FANTOM5 dataframe holds descriptions of the samples and the url where they can be retrieved. # There's an easy way to import samples that match a certain term into a CAGEset object. #mac_samples <- FANTOM5mouseSamples[grep("macrophage, bone marrow derived", # FANTOM5mouseSamples[,"description"]),] print("NOTE: reference genome for the public CAGE files is mm9!") ce <- CAGEr::CAGEexp( genomeName = "BSgenome.Mmusculus.UCSC.mm9", inputFiles = c(opt$ctss_pool1 ,opt$ctss_pool2), inputFilesType = "ctss", sampleLabels = c("pool1","pool2") ) # To actually read in the data into the object we use getCTSS() function, that will add an experiment called tagCountMatrix to the CAGEexp object. ce <- CAGEr::getCTSS(ce) ce #------------------------------------------------------------------------------------ #---------------------- QC #------------------------------------------------------------------------------------ ncbim37_anno <- rtracklayer::import.gff(opt$gencode_mm9_geneanno) ce <- annotateCTSS(ce, ncbim37_anno) colData(ce)[,c("librarySizes", "promoter", "exon", "intron", "unknown")] plotAnnot(ce, "counts") corr.m <- plotCorrelation2( ce, samples = "all" , tagCountThreshold = 1, applyThresholdBoth = FALSE , method = "pearson") #------------------------------------------------------------------------------------ #---------------------- Get read clusters #------------------------------------------------------------------------------------ print("Merging samples") #Now we can merge them ce <- mergeSamples(ce, mergeIndex = c(1,1), mergedSampleLabels = c("BMDM")) # redo annotation since this gets reset during merging ce <- annotateCTSS(ce, ncbim37_anno) print("The total library size is:") print(librarySizes(ce)) # Check if data follows a power law distribution plotReverseCumulatives(ce, fitInRange = c(5, 3000), onePlot = TRUE) print("Normalizing reads") # Since we don't really care about making comparisons between different population we could prob just skip the normalization # The fit range is chosen from the plot. We take the alpha from the ref distribution and set T to a million to get the tag count per million (TPM) ce <- normalizeTagCount(ce, method = "powerLaw", fitInRange = c(5, 3000), alpha = 1.15, T = 1*10^6) #mac_CAGEset@tagCountMatrix print("Cluster the tags") # After normalization we can cluster the tags. # Clustering, only seems to work with the CAGEset object (due to some problems with the IRanges column) # From the CAGEr vignette: # "Transcription start sites are found in the promoter region of a gene and reflect the transcriptional activity of that promoter (Figure 5). TSSs in the close proximity of each other give rise to a functionally equivalent set of transcripts and are likely regulated by the same promoter elements. Thus, TSSs can be spatially clustered into larger transcriptional units, called tag clusters (TCs) that correspond to individual promoters. CAGEr supports three methods for spatial clustering of TSSs along the genome, two ab initio methods driven by the data itself, as well as assigning TSSs to predefined genomic regions:" ce <- clusterCTSS(ce, threshold=1, thresholdIsTpm = TRUE, nrPassThreshold = 1, method="distclu", maxDist=20, removeSingletons = TRUE, keepSingletonsAbove = 3) # Let's have a look what the result looks like head(tagClustersGR(ce, sample = "BMDM")) # calculate cumulative distribution for every tag cluster in each of the samples ce <- cumulativeCTSSdistribution(ce, clusters = "tagClusters", useMulticore = T) # determine the positions of selected quantiles ce <- quantilePositions(ce, clusters = "tagClusters", qLow = 0.1, qUp = 0.9) # How many tagclusters do we have in total? length(tagClustersGR(ce, sample = "BMDM")) # histogram of interquantile width plotInterquantileWidth(ce, clusters = "tagClusters", tpmThreshold = 3, qLow = 0.1, qUp = 0.9) print("Retrieving clusters as GenomicRanges") clusters_gr <- tagClustersGR(ce, sample="BMDM") #------------------------------------------------------------------------------------ #---------------------- Liftover coordinates to mm10 #------------------------------------------------------------------------------------ print("Liftover to mm10") # * Now we can lift over the intervals to mm10 # * Annotate them with peakanno # * pick the most highly expressed one for each gene liftover <- function(peaks_gr_mm9){ #input is a GenomicRanges object in mm9 coordinates #lift peak locations from mm9 to mm10 chain <- rtracklayer::import.chain(opt$liftoverchain) on.exit( close( file(opt$liftoverchain)) ) peaks_gr_mm10 <- rtracklayer::liftOver(peaks_gr_mm9, chain) peaks_gr_mm10 <- GenomicRanges::GRanges(unlist(peaks_gr_mm10)) return(peaks_gr_mm10) } mac_cage_mm10 <- liftover( clusters_gr ) ggplot(as.data.frame(mac_cage_mm10), aes(x=width)) + geom_histogram(bins = 100) # Liftover coordinates of dominant_ctss dominant_ctss <- liftover( GRanges( seqnames = seqnames(clusters_gr), ranges = IRanges(start = clusters_gr$dominant_ctss, end = clusters_gr$dominant_ctss), score=clusters_gr$score) ) #------------------------------------------------------------------------------------ #----------------- Annotate TSS clusters to reference gene coordinates #------------------------------------------------------------------------------------ print("Annotate TSS coordinates") # Use coordinates of the dominant ctss downstream mac_cage_anno <- ChIPseeker::annotatePeak(dominant_ctss, tssRegion=c(-1000, 1000), #more stringent than default level = "gene", TxDb=TxDb.Mmusculus.UCSC.mm10.knownGene, annoDb = "org.Mm.eg.db") # For those that are reasonably close to a TSS, # check for each gene, which position has the highest score. mac_cage_maxscore <- as.data.frame(mac_cage_anno) %>% filter(abs(distanceToTSS)<=30000) %>% mutate(SYMBOL=as.factor(SYMBOL)) %>% filter(SYMBOL!="") %>% group_by(SYMBOL)%>% filter(score == max(score))%>% filter(distanceToTSS == min(distanceToTSS )) # for tied score, use shorter distance nrow(mac_cage_maxscore) ggplot(mac_cage_maxscore, aes(x=distanceToTSS)) + geom_histogram(bins = 100) #------------------------------------------------------------------------------------ #---------- retrieve gene coordinates and promoterregion from reference #------------------------------------------------------------------------------------ gencode_mm10_geneanno <- rtracklayer::import.gff(opt$gencode_mm10_geneanno) genecoords <- as.data.frame(gencode_mm10_geneanno) %>% dplyr::select("seqnames","start","end","strand","gene_id") %>% mutate(score=0) %>% dplyr::mutate(gene_id=gsub("\\.[0-9]*$","",gene_id)) %>% dplyr::filter(!"gene_id"=="")%>% dplyr::select("seqnames","start","end","gene_id","score","strand") gencode_mm10_promoterregions <- promoters(gencode_mm10_geneanno) gencode_mm10_promoterregions <- as.data.frame(gencode_mm10_promoterregions) %>% dplyr::select("seqnames","start","end","strand","gene_id") %>% mutate(score=0) %>% dplyr::mutate(gene_id=gsub("\\.[0-9]*$","",gene_id)) %>% dplyr::filter(!"gene_id"=="")%>% dplyr::select("seqnames","start","end","gene_id","score","strand") #------------------------------------------------------------------------------------ #----------------- export files #------------------------------------------------------------------------------------ write.table(genecoords, file = paste0(outdir,"reference_genecoords.bed"), sep="\t", col.names = FALSE, quote=FALSE, row.names = FALSE) write.table(gencode_mm10_promoterregions, file = paste0(outdir,"reference_promoterregions.bed"), sep="\t", col.names = FALSE, quote=FALSE, row.names = FALSE) rtracklayer::export.bed(as.data.frame(mac_cage_mm10), paste0(outdir, "mac_cage_tssclusterregions.bed"), format="bed") rtracklayer::export.bed(as.data.frame(dominant_ctss), paste0(outdir, "mac_cage_dominant_ctss.bed"), format="bed") rtracklayer::export.bed(mac_cage_maxscore, paste0(outdir,"mac_cage_maxscore.bed"), format="bed") |
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 | suppressPackageStartupMessages(library(optparse, warn.conflicts=F, quietly=T)) option_list <- list( make_option(c( "--contrast_DexLPSvLPS"), type="character", help="Path to annotated tsv file of DeSeq2 contrast of DexLPS vs LPS"), make_option(c("--assignment_summit_prox"), type="character", help="Path to rds file of proximity based assignment of peak summits to genes"), make_option(c("--assignment_summits_abcregion_dexlps"), type="character", help="Path to rds file of assignment of peak summits within abcregions to genes (in DexLPS condition)"), make_option(c("--assignment_summits_abcregion_lps"), type="character", help="Path to rds file of assignment of peak summits within abcregions to genes (in LPS condition)"), make_option(c("--assignment_abcregion_dexlps"), type="character", help="Path to rds file of assignment of abcregions to genes (in DexLPS condition)"), make_option(c( "--assignment_abcregion_lps"), type="character", help="Path to rds file of assignment of abcregions to genes (in LPS condition)"), make_option(c("--motifcounts_summitregion"), type="character", help="Path to rds file of fimo motifcounts within summitregions"), make_option(c("--motifcounts_abcregion_dexlps"), type="character", help="Path to rds file of fimo motifcounts within ABC regions (in DexLPS condition)"), make_option(c("--motifcounts_abcregion_lps"), type="character", help="Path to rds file of fimo motifcounts within ABC regions (in LPS condition)"), make_option(c("--model_coefs_joint"), type="character", help="Path to rds file with model coefficients of joint models"), make_option(c("--model_coefs_sep"), type="character", help="Path to rds file with model coefficients of models tha include enhancers and promoters separately"), make_option(c( "--featuredir"), type="character", help="Path to directory with unscales featurematrizes"), make_option(c( "--outfile"), type="character", help="Path to output file with metrics") ) opt <- parse_args(OptionParser(option_list=option_list)) dir.create(opt$featuredir) #change default for stringAsFactors options(stringsAsFactors = FALSE) suppressPackageStartupMessages(library(here, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ggplot2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(plyranges, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(org.Mm.eg.db, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(DESeq2, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(ComplexHeatmap, warn.conflicts=F, quietly=T)) suppressPackageStartupMessages(library(dplyr, warn.conflicts=F, quietly=T)) #set defaults for ggplot2 figures theme_update(panel.background = element_rect(fill = "transparent", colour = NA), plot.background = element_rect(fill = "transparent", colour = NA), legend.background = element_rect(fill = "transparent", colour = NA), legend.key = element_rect(fill = "transparent", colour = NA), text=element_text(size=6, family = "ArialMT", colour="black"), title=element_text(size=8, family="ArialMT", colour="black"), panel.grid.major = element_line(colour="grey", size=0.2), panel.grid.minor = element_blank(), axis.text = element_text(size=6, family="ArialMT", colour="black"), axis.line = element_line(colour="black"), axis.ticks = element_line(colour="black"), legend.key.size = unit(6, 'points'), #change legend key size legend.key.height = unit(6, 'points'), #change legend key height legend.key.width = unit(6, 'points'), #change legend key width legend.text = element_text(size=6, family="ArialMT", colour="black")) set.seed(12345) #------------------------------- ## read in data #------------------------------- contrast_DexLPSvLPS <- read.delim(opt$contrast_DexLPSvLPS) for (optname in names(opt)[2:11]){ #except for the featuredir print(paste0("Loading ", optname)) assign(optname, readRDS( opt[[optname]] )) } #-------------------------------------------- ## ---- function definitions #-------------------------------------------- coerce_coef2df <- function(model_coef){ model_coef <- as.matrix(model_coef) model_coef <- as.data.frame(model_coef) model_coef$names <- rownames(model_coef) colnames(model_coef) <- c("estimates", "names") return(model_coef) } #------------function will------------: # * merge the motifdata with gene assignments and # * then merge the gene expression changes, # * filter for DE genes and aggregate per gene #-------------------------------------- merge_motifdata_with_assignments <- function( motifcounts, assignments, contrast, maxonly=FALSE, excludepromoters=FALSE, weightby=FALSE, sepPromEnh=FALSE){ # check if we should use all abc assignments, or only the max one of each peakID if(maxonly==TRUE){ assignments <- assignments %>% group_by(name) %>% filter(abcscore==max(abcscore)) %>% distinct() } else{ assignments <- assignments } if(excludepromoters=="all"){ assignments <- assignments %>% filter(!class=="promoter") } else if (excludepromoters=="onlyNONself"){ assignments <- assignments %>% filter(!c(class=="promoter" & isSelfPromoter=="False")) } else{ assignments <- assignments } motifdf <- merge(motifcounts, assignments, by.x="name", by.y="name") motifdf <- motifdf %>% relocate(c(anno)) # merge the expression change # use mgi_symbol for prox based, otherwise ensemblID # We DONT set all.x=TRUE because we don't care about predicting gene that aren't even expressed or that we don't have a clear label for if(maxonly=="prox"){ motifdf <- merge(motifdf, contrast, by.x="anno", by.y="mgi_symbol") } else { motifdf <- merge(motifdf, contrast, by.x="anno", by.y="Row.names") } #recode the logFC and padj into a label (optionally through command line arguments) motifdf <- motifdf %>% mutate(label=case_when(log2FoldChange>0.58 & padj < 0.05 ~ "up", log2FoldChange<(-0.58) & padj < 0.05 ~ "down", TRUE ~ "no_change")) %>% filter(label!="no_change") %>% mutate(label=factor(label, levels=c("down","up"), labels=c(0,1))) %>% relocate(label) # chr 2, 3 and 4 (20%) were used as the tuning set for hyperparameter tuning. # Regions from chromosomes 1, 8 and 9 (20%) were used as the test set for performance evaluation # The remaining regions were used for model training. #------aggregate over gene SYMBOL if ("abcscore" %in% colnames(motifdf)) { # loop for ABC based assignments if (weightby=="abcscore"){ unselect_col <- "abcnumerator" } else if(weightby=="abcnumerator") { unselect_col <- "abcscore" } else{ unselect_col <- c("abcscore","abcnumerator") } if (sepPromEnh==TRUE){ motifdf_aggr <- motifdf %>% dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>% { if(weightby!=FALSE) mutate(.,across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways group_by(label,seqnames,anno,class) %>% dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts ungroup() # From here we need to cast the motifcounts for the promoterregions, so that in the end we have one row per gene (instead of 1-2) motifdf_aggr <- motifdf_aggr %>% tidyr::pivot_wider(id_cols=c(label,seqnames,anno), names_from=class, values_from = !c(label,seqnames,anno, class), values_fill = 0) } else { motifdf_aggr <- motifdf %>% dplyr::select(!c(unselect_col,"name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype","mgi_symbol")) %>% { if(weightby!=FALSE) mutate(., across(where(is.numeric), ~ (.x * get(weightby)))) else .} %>% # weight features by score dplyr::select(!any_of(as.character(weightby))) %>% # then we can drop the score since it will be nonsensical after the aggregation anyways group_by(label,seqnames,anno) %>% dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts ungroup() } } else { # loop for prox based assignments motifdf_aggr <- motifdf %>% dplyr::select(!c("name","baseMean","log2FoldChange","lfcSE","stat","pvalue","padj","gene_biotype")) %>% group_by(label,seqnames,anno) %>% dplyr::summarise(across( where(is.numeric), .fns=sum )) %>% # sum up genewise feature counts ungroup() } return(motifdf_aggr) } get_feature_and_label_metrics <- function(motifdf, trainvalidx, genenames){ targets_train <- motifdf[ trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix() targets_test <- motifdf[ -trainvalidx, ] %>% pull(label) %>% as.numeric(levels(.))[.] %>% as.matrix() metrics = data.frame( n_input_features = ncol( motifdf[ , -c(1,2,3)]),# first three columns are labels, chromosome and gene annotation n_neg_inst_train = table(targets_train)[['0']], n_pos_inst_train = table(targets_train)[['1']], n_neg_inst_test = table(targets_test)[['0']], n_pos_inst_test = table(targets_test)[['1']] ) return(metrics) } #------------------------------------------------ ## initialize object to gather all metrics #------------------------------------------------ all_metrics=data.frame( n_input_features=numeric(), n_neg_inst_train=numeric(), n_pos_inst_train=numeric(), n_neg_inst_test=numeric(), n_pos_inst_test=numeric() ) #------------------------------------------------ ## run proximity based #------------------------------------------------ #---------------------- # This is independent of the ABC results (no need to loop through different assignment variations) print("Getting metrics of prox-based model") my_rdsfile <- paste0(opt$featuredir,"prox.rds") if(!file.exists(my_rdsfile)){ motifdata_aggr_prox <- merge_motifdata_with_assignments(motifcounts_summitregion, assignment_summit_prox, contrast_DexLPSvLPS, maxonly="prox", excludepromoters=FALSE, weightby = FALSE, sepPromEnh = FALSE) saveRDS(motifdata_aggr_prox,file=my_rdsfile) } else{ motifdata_aggr_prox <- readRDS(my_rdsfile) } motifdata_aggr_prox_scaled <- motifdata_aggr_prox %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) motifdata_aggr_prox_tranval_idx <- motifdata_aggr_prox_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) metrics_prox <- get_feature_and_label_metrics (motifdata_aggr_prox_scaled, motifdata_aggr_prox_tranval_idx, genenames=genenames) all_metrics <- rbind(all_metrics, "prox"=metrics_prox) #---------------------- not_all_na <- function(x) any(!is.na(x)) for (motifdata in c("motifcounts_abcregion","motifcounts_summitregion")){ for(excludepromoters in c(FALSE,"all","onlyNONself")){ for (onlymax in c(TRUE,FALSE)){ for (sepPromEnh in c(TRUE,FALSE)){ for (weight in c(FALSE, "abcscore")){ # set assignments fitting for the input data if(motifdata=="motifcounts_abcregion"){ assignment_dexlps <- assignment_abcregion_dexlps assignment_lps <- assignment_abcregion_lps motifcounts_dexlps <- motifcounts_abcregion_dexlps motifcounts_lps <- motifcounts_abcregion_lps } else if (motifdata=="motifcounts_summitregion" ){ # in this case the motifcounts for the 2 conditions are the same, but there assignments differ assignment_dexlps <- assignment_summits_abcregion_dexlps assignment_lps <- assignment_summits_abcregion_lps motifcounts_dexlps <- motifcounts_summitregion motifcounts_lps <- motifcounts_summitregion } else {break} #-----------------------DEXLPS------------------------------ modelname <- paste(motifdata,"condition_dexlps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_") print(modelname) my_rdsfile <- here( paste0(opt$featuredir, modelname,".rds")) if(!file.exists(my_rdsfile)){ featurematrix_dexlps <- merge_motifdata_with_assignments(motifcounts_dexlps, assignment_dexlps, contrast_DexLPSvLPS, weightby=weight, maxonly=onlymax, excludepromoters=excludepromoters, sepPromEnh=sepPromEnh) saveRDS(featurematrix_dexlps, my_rdsfile) } else{ featurematrix_dexlps <- readRDS(my_rdsfile) } # remove columns that are NA after scaling featurematrix_dexlps_scaled <- featurematrix_dexlps %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>% dplyr::select(where(not_all_na)) motifdata_aggr_tranval_idx <- featurematrix_dexlps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) new_metric <- get_feature_and_label_metrics (featurematrix_dexlps_scaled, motifdata_aggr_tranval_idx, genenames=genenames) all_metrics <- rbind(all_metrics, new_metric)%>% magrittr::set_rownames(c(rownames(all_metrics),modelname)) #--------------------------LPS--------------------------- modelname <- paste(motifdata,"condition_lps_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_") print(modelname) my_rdsfile <- here(paste0(opt$featuredir, modelname,".rds")) if(!file.exists(my_rdsfile)){ featurematrix_lps <- merge_motifdata_with_assignments(motifcounts_lps, assignment_lps, contrast_DexLPSvLPS, weightby=weight, maxonly=onlymax, excludepromoters=excludepromoters, sepPromEnh=sepPromEnh) saveRDS(featurematrix_lps,file=my_rdsfile) } else{ featurematrix_lps <- readRDS(my_rdsfile) } featurematrix_lps_scaled <- featurematrix_lps %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>% dplyr::select(where(not_all_na)) motifdata_aggr_tranval_idx <- featurematrix_lps_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) new_metric <- get_feature_and_label_metrics (featurematrix_lps_scaled, motifdata_aggr_tranval_idx, genenames=genenames) all_metrics <- rbind(all_metrics, new_metric)%>% magrittr::set_rownames(c(rownames(all_metrics),modelname)) #-----------------------DIFFERENCE------------------------------ modelname <- paste(motifdata,"condition_DexLPS-LPS_exclprom",excludepromoters,"onlymax",onlymax,"sepPromEnh",sepPromEnh,"weight",weight, sep="_") print(modelname) my_rdsfile <- here(paste0(opt$featuredir, modelname,".rds")) if(!file.exists(my_rdsfile)){ # they contain the same motifs, but not the same genes. # doublecheck that all columnnames are identical table(colnames(featurematrix_dexlps) == colnames(featurematrix_lps)) # motifcounts missing in one condition should be set to 0 merged_featurematrix <- merge(featurematrix_dexlps, featurematrix_lps, by=c("anno","label","seqnames"), all=TRUE) # replace missing counts in one of the conditions with 0 merged_featurematrix[is.na(merged_featurematrix)] <- 0 # use dataframe suffix to grab respective columns featurematrix_diff <- merged_featurematrix[grep(".x$",colnames(merged_featurematrix))] - merged_featurematrix[grep(".y$",colnames(merged_featurematrix))] # tidy up column names colnames(featurematrix_diff) <- gsub(".x$","", colnames(featurematrix_diff)) # add first 3 columns back after computing the difference of the counts featurematrix_diff <- cbind(merged_featurematrix[1:3],featurematrix_diff) saveRDS(featurematrix_diff,file=my_rdsfile) } else{ featurematrix_diff <- readRDS(my_rdsfile) } featurematrix_diff_scaled <- featurematrix_diff %>% mutate(., across(where(is.numeric), ~(scale(.) %>% as.vector))) %>% dplyr::select(where(not_all_na)) # run GLM and look at performance featurematrix_diff_tranval_idx <- featurematrix_diff_scaled %>% with(which(seqnames!="chr1" & seqnames!="chr8" & seqnames!="chr9")) new_metric <- get_feature_and_label_metrics (featurematrix_diff_scaled, featurematrix_diff_tranval_idx, genenames=genenames) all_metrics <- rbind(all_metrics, new_metric)%>% magrittr::set_rownames(c(rownames(all_metrics),modelname)) } } } } } all_metrics <- all_metrics %>% mutate(ratio_pos_inst_train = n_pos_inst_train/(n_pos_inst_train+n_neg_inst_train), ratio_pos_inst_test = n_pos_inst_test/(n_pos_inst_test+n_neg_inst_test) ) # let'S add the number of non-zero coefficients to this model_coefs_joint <- readRDS( opt$model_coefs_joint ) model_coefs_sep <- readRDS( opt$model_coefs_sep ) nonzero_columentries <- rbind ( colSums(model_coefs_joint != 0, na.rm=TRUE) %>% as.data.frame(), colSums(model_coefs_sep != 0, na.rm=TRUE) %>% as.data.frame() ) colnames(nonzero_columentries) <- "n_sel_features_inclintercept" all_metrics <- merge(all_metrics, nonzero_columentries, by="row.names") %>% relocate(n_sel_features_inclintercept, .after = n_input_features ) # parse feature engineering choices from modelname and add them as variables all_metrics <- all_metrics %>% mutate(condition = case_when(grepl("condition_lps", Row.names) ~ "LPS", grepl("condition_DexLPS-LPS", Row.names) ~ "Dex+LPS - LPS", grepl("condition_dexlps", Row.names) ~ "Dex+LPS"), input_region = case_when(grepl("motifcounts_summitregion", Row.names) ~ "GR summitregions", grepl("motifcounts_abcregion", Row.names) ~ "active regions"), excludepromoters = case_when(grepl("exclprom_all", Row.names) ~ "all", grepl("exclprom_onlyNONself", Row.names) ~ "nonself", grepl("exclprom_FALSE", Row.names) ~ "none"), mapping = case_when(grepl("onlymax_FALSE", Row.names) ~ "1-to-many (ABC-based)", grepl("onlymax_TRUE", Row.names) ~ "1-to-1 (ABC-based)"), weight = case_when(grepl("weight_abcscore", Row.names) ~ "abcscore", grepl("weight_FALSE", Row.names) ~ "none"), prom_and_enh_features = case_when(grepl("sepPromEnh_TRUE", Row.names) ~ "separate", grepl("sepPromEnh_FALSE", Row.names) ~ "aggregate") ) # manually update choices for the reference model all_metrics <- all_metrics %>% rows_update( tibble(Row.names = "prox", condition="Dex+LPS", input_region="GR summitregions", mapping="1-to-1 (proximity-based)", weight="none"), by = "Row.names") summary(all_metrics) summary(all_metrics$ratio_pos_inst_train-all_metrics$ratio_pos_inst_test) write.table(all_metrics, file=opt$outfile, quote=FALSE, sep="\t", row.names = FALSE, col.names = TRUE ) |
Workflow dervied form https://github.com/snakemake-workflows/rna-seq-star-deseq2 for multi condition Deseq2 and enrichment analyis (v0.1.1)
11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | knitr::opts_chunk$set( echo = FALSE, warning = FALSE, message = FALSE, dev = "png", fig.width = 12, fig.height = 12 ) library(purrr) library(clusterProfiler) library(magrittr) library(ggplot2) library(org.Mm.eg.db) library(svglite) if (exists("snakemake")) { gsea_list <- snakemake@input[["gsea_result"]] plot_path <- snakemake@params[["plot_path"]] c_groups <- snakemake@params[["contrast_groups"]] write(unlist(c_groups), file = stderr()) names(gsea_list) <- names(x = c_groups) species_name <- snakemake@config[["organism"]] gene_name_type <- snakemake@config[["gene_name_type"]] gsea_conf <- snakemake@config[["gsea"]] save_plots <- TRUE } else { conf <- yaml::read_yaml("../../../configs/VascAge_Apelin_config.yaml") BASE_DIR <- file.path(conf$dirs$BASE_ANALYSIS_DIR) cond_id <- names(conf$diffexp$contrasts)[1] gsea_list <- as.list(file.path(BASE_DIR, glue::glue("results/diffexp/{cond_id}/{names(conf$diffexp$contrasts[[cond_id]])}.gseares.RDS"))) c_groups <- names(conf$diffexp$contrasts[[cond_id]]) names(gsea_list) <- c_groups # gsea_list <- gsea_list[1:3] plot_path <- "/home/heyer/tmp/" species_name <- "Mus musculus" gene_name_type <- "ENSEMBL" save_plots <- FALSE gsea_conf <- conf$gsea } limit <- 10 |
4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | library(tibble) if (!require(RNAscripts)) { devtools::install("./scripts/RNAscripts", upgrade = "never") } library(RNAscripts) if (packageVersion("mitch") < "1.1.8") { devtools::install_github("markziemann/mitch") } # library(org.Mm.eg.db) library(mitch) library(tidyverse) library(ensembldb) library(AnnotationHub) ## Load the annotation resource. ah <- AnnotationHub() Gtf <- ah["AH28753"] ## Create a EnsDb database file from this. DbFile <- ensDbFromAH(Gtf) ## We can either generate a database package, or directly load the data edb <- EnsDb(DbFile) library(msigdbr) if (exists("snakemake")) { # Input Files diffexp_tables_paths <- snakemake@input[["tables"]] contrast_list <- snakemake@params[["contrasts"]] fpkm_path <- snakemake@input[["fpkm"]] # Output Files mitch_table <- snakemake@output[["mitch_table"]] mitch_rds <- snakemake@output[["mitch_rds"]] report_path <- snakemake@output[["mitch_report"]] # Parameters threads <- snakemake@threads enrichment_term_type <- "SYMBOL" } else { BASE_ANALYSIS_DIR <- "/omics/odcf/analysis/OE0228_projects/VascularAging/rna_sequencing/cre_2022/" contrast_list <- c( "basal_cre_pos_vs_basal_cre_neg", "tumor_cre_pos_vs_tumor_cre_neg", "tumor_cre_pos_vs_basal_cre_pos", "tumor_cre_neg_vs_basal_cre_neg" ) diffexp_tables_paths <- as.list(file.path(BASE_ANALYSIS_DIR, glue::glue("results/diffexp/{contrast_list}.diffexp.tsv"))) fpkm_path <- file.path(BASE_ANALYSIS_DIR, "fpkm/all.tsv") threads <- 1 report_path <- "big_yeeter.html" enrichment_term_type <- "SYMBOL" } ## Read data diff_exp_tables <- purrr::map(diffexp_tables_paths, readr::read_tsv, col_names = c( "gene_id", "baseMean", "logFoldChange", "lfcSE", "stat", "pvalue", "padj" ), skip = 1) names(diff_exp_tables) <- contrast_list diff_exp_frames <- purrr::map(diff_exp_tables, function(x) { gene_ids <- x$gene_id diff_exp_fr <- as.data.frame(x[, -1]) rownames(diff_exp_fr) <- gene_ids diff_exp_fr }) # names(diff_exp_frames) <- contrast_list Read FPKM file fpkm <- readr::read_tsv(fpkm_path) match_table <- fpkm %>% dplyr::select(c("gene", "gname")) match_vec <- setNames(object = match_table$gname, nm = match_table$gene) ### Setupt mitch input data prep_mitch_input <- function(deseq_list, e_term = "ENSEMBL") { # mitch_input_df <- mitch::mitch_import(diff_exp_frames, 'DESeq2', geneTable = as.data.frame(fpkm[,c('gene', # 'gname')])) mitch_input_df <- mitch::mitch_import(deseq_list, "DESeq2") gname_tb <- tibble::tibble(Transcript_id = rownames(mitch_input_df)) if (e_term == "ENSEMBL") { gname_tb["gene_id"] <- stringr::str_extract(gname_tb$Transcript_id, "ENSMUSG[0-9]*") } else if (e_term == "SYMBOL") { gname_tb["gene_id"] <- match_vec[gname_tb$Transcript_id] } rownames(mitch_input_df) <- gname_tb$gene_id colnames(mitch_input_df) <- stringr::str_remove_all(colnames(mitch_input_df), pattern = "-") ifelse(anyDuplicated(gname_tb$gene_id), warning("Duplicated genes in mitch input dataframe."), print("no duplicates")) mitch_input_df } #' Uniquify a vector #' #' @param x Input factor to check #' @return corrected facotr value #' @examples #' NULL uniquify <- function(x) { while (anyDuplicated(x)) { x[duplicated(x)] <- paste0(x[duplicated(x)], "_1") } x } mitch_input_df <- prep_mitch_input(diff_exp_frames) # mm_reactome <- buildReactome(output_type = enrichment_term_type) ensembl_reactome <- buildReactome(output_type = # 'ENSEMBL') if (enrichment_term_type == "SYMBOL") { # Get GENE ids new_ids <- ensembldb::select(edb, keys = rownames(mitch_input_df), keytype = "GENEID", columns = c("SYMBOL", "GENEID")) update_vector <- setNames(new_ids$SYMBOL, nm = new_ids$GENEID) # Stop if any duplicated genes in reactome gene set names(match_vec) <- stringr::str_extract(names(match_vec), "ENSMUSG[0-9]*") geneIDs <- match_vec[rownames(mitch_input_df)] geneIDs[names(update_vector)] <- update_vector col_names <- c("gs_name", "gene_symbol") mm_msigdbr <- msigdbr::msigdbr(species = "Mus musculus", category = "H") %>% dplyr::select(col_names) msigdb_gene_list <- split(x = mm_msigdbr$gene_symbol, f = mm_msigdbr$gs_name) ensembl_in_msigdb <- new_ids %>% dplyr::filter(SYMBOL %in% mm_msigdbr$gene_symbol) %>% pull(GENEID) ## Cleanup duplicates mitch_input_df <- mitch_input_df[(rownames(mitch_input_df) %in% ensembl_in_msigdb) | !(duplicated(geneIDs[rownames(mitch_input_df)]) | duplicated(geneIDs[rownames(mitch_input_df)], fromLast = T)), ] geneIDs <- match_vec[rownames(mitch_input_df)] # geneIDs[names(update_vector)] <- update_vector dup_mitch <- mitch_input_df[] %>% as_tibble(rownames = "ENSEMBL") dup_mitch["SYMBOL"] <- geneIDs[dup_mitch$ENSEMBL] # Remove samples which(dup_mitch$ENSEMBL %in% unlist(ensembl_reactome)) final_name_vec <- setNames(dup_mitch$SYMBOL, nm = dup_mitch$ENSEMBL) final_name_vec <- final_name_vec[-which(!(final_name_vec %in% mm_msigdbr$gene_symbol) & duplicated(final_name_vec))] # final_name_vec['ENSMUSG00000051396'] <- 'Gm45902' if (anyDuplicated(final_name_vec)) { stop("Duplicate in gene names") } # geneIDs <- uniquify(geneIDs) mitch_input_df <- mitch_input_df[names(final_name_vec), ] rownames(mitch_input_df) <- final_name_vec } ## Setup Databases to test msig_hallmarck <- msigdbr::msigdbr(species = 'Mus musculus') # bain <- msig_hallmarck %>% dplyr::select(c('gs_name', 'gene_symbol')) %>% group_split(gs_name) # more_bain <- purrr::map(bain, function(x){x$gene_symbol}) # mitch::mitch_calc(mitch_input_df, more_bain, cores = 6, priority ='significance') # mm_reactome <- buildReactome(output_type = enrichment_term_type) print(colnames(mitch_input_df)) # print(anyDuplicated(colnames(mitch_input_df))) reac_test <- mitch::mitch_calc(mitch_input_df, msigdb_gene_list, cores = threads) readr::write_tsv(reac_test$enrichment_result, mitch_table) saveRDS(reac_test, file = mitch_rds) mitch::mitch_report(reac_test, outfile = report_path) |
R
Snakemake
tidyverse
DESeq2
org.Mm.eg.db
tibble
AnnotationHub
msigdbr
ensembldb
mitch
From
line
4
of
scripts/run_mitch.R
Snakemake pipeline for Gene Set Enrichment: pathway and GO enrichment analysis
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | library("edgeR") library(biomaRt) library(dplyr) library("AnnotationDbi") library("org.Hs.eg.db") library(data.table) library("gplots") library(magrittr) library(pathview) library("org.Mm.eg.db") library(gage) library(gageData) library(data.table) args <- commandArgs(trailingOnly = TRUE) dge = args[1] ORGANISM =args[2] compare_type = args[3] outname = args[4] res <- read.csv(file = dge, header = TRUE) summary(res) foldchanges = res$logFC names(foldchanges) = res$entrez outname = paste(outname, compare_type, sep ="-") outGO = paste(outname, "GO.csv", sep ="-") if (ORGANISM == "HUMAN") { data(kegg.sets.hs) data(go.sets.hs) data(carta.hs) data(sigmet.idx.hs) data(go.subs.hs) kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs] head(kegg.sets.hs,3) #GO data(go.sets.hs) data(go.subs.hs) gobpsets = go.sets.hs[go.subs.hs$BP] gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =TRUE, compare =compare_type,cutoff=0.05) lapply(gobpres, head) write.csv(gobpres, outGO) } else if (ORGANISM == "MOUSE"){ data(kegg.sets.mm) data(go.sets.mm) data(carta.mm) data(sigmet.idx.mm) data(go.subs.mm) kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm] head(kegg.sets.mm,3) data(go.sets.mm) data(go.subs.mm) gobpsets = go.sets.mm[go.subs.mm$BP] gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =TRUE, compare =compare_type,cutoff=0.05) lapply(gobpres, head) write.csv(gobpres, outGO) } |
R
dplyr
data.table
org.Hs.eg.db
org.Mm.eg.db
magrittr
biomaRt
gplots
AnnotationDbi
gage
From
line
1
of
scripts/GO.R
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 | library("edgeR") library(biomaRt) library(dplyr) library("AnnotationDbi") library("org.Hs.eg.db") library(data.table) library("gplots") library(magrittr) library(pathview) library("org.Mm.eg.db") library(gage) library(gageData) library(data.table) args <- commandArgs(trailingOnly = TRUE) dge = args[1] ORGANISM =args[2] compare_type = args[3] outname = args[4] outname res <- read.csv(file = dge, header = TRUE) outname = paste(outname, compare_type, sep ="-") out = paste(outname, "KEGG.csv", sep="-") outUP = paste(outname, "KEGG_UP.csv", sep ="-") outDOWN = paste(outname, "KEGG_DOWN.csv", sep ="-") foldchanges = res$logFC names(foldchanges) <- res$entrez summary(foldchanges) #--------------------------- #KEGG Analysis #--------------------------- if (ORGANISM == "HUMAN") { data(kegg.sets.hs) data(go.sets.hs) data(carta.hs) data(sigmet.idx.hs) data(go.subs.hs) kegg.sets.hs = kegg.sets.hs[sigmet.idx.hs] head(kegg.sets.hs,3) #---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways data(kegg.gs) keggres = gage(foldchanges, gsets =kegg.sets.hs, same.dir = TRUE, compare=compare_type,make.plot = TRUE) lapply(keggres, head) write.csv(keggres,out) keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>% tbl_df() %>% filter(row_number()<=30) %>% .$id %>% as.character() keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>% tbl_df() %>% filter(row_number()<=30) %>% .$id %>% as.character() write.csv(keggrespathwaysup, outUP) write.csv(keggrespathwaysdn, outDOWN) #------------------------------------------------------------------------------------------------------------------------------- keggresidsup = substr(keggrespathwaysup, start=1, stop=8) keggresidsup keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8) #gobpres = gage(foldchanges, gsets=kegg.sets.hs, same.dir =FALSE, compare ="paired",cutoff=0.05) #lapply(gobpres, head) #---------------------------------------------------------Define plotting function for applying later plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human", new.signature=FALSE) #---------------------------------------------------------plot multiple pathways ups and downs tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human")) tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="human")) } else if (ORGANISM == "MOUSE"){ data(kegg.sets.mm) data(go.sets.mm) data(carta.mm) data(sigmet.idx.mm) data(go.subs.mm) kegg.sets.mm = kegg.sets.mm[sigmet.idx.mm] head(kegg.sets.mm,3) #---------------------------------------------------Use Kegg and gage to get upregulated and downregulated pathways data(kegg.gs) keggres = gage(foldchanges, gsets =kegg.sets.mm, same.dir = TRUE, compare=compare_type,make.plot = TRUE) lapply(keggres, head) write.csv(keggres,out) keggrespathwaysup = data.frame(id=rownames(keggres$greater), keggres$greater) %>% tbl_df() %>% filter(row_number()<=30) %>% .$id %>% as.character() keggrespathwaysdn = data.frame(id=rownames(keggres$less), keggres$less) %>% tbl_df() %>% filter(row_number()<=30) %>% .$id %>% as.character() write.csv(keggrespathwaysup, outUP) write.csv(keggrespathwaysdn, outDOWN) #------------------------------------------------------------------------------------------------------------------------------- keggresidsup = substr(keggrespathwaysup, start=1, stop=8) keggresidsup keggresidsdn = substr(keggrespathwaysdn, start=1, stop=8) #gobpres = gage(foldchanges, gsets=kegg.sets.mm, same.dir =FALSE, compare ="paired",cutoff=0.05) #lapply(gobpres, head) #---------------------------------------------------------Define plotting function for applying later plot_pathway = function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse", new.signature=FALSE) #---------------------------------------------------------plot multiple pathways ups and downs tmpup = sapply(keggresidsup, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse")) tmpdn = sapply(keggresidsdn, function(pid) pathview(gene.data=foldchanges,gene.idtype="ENTREZID", pathway.id=pid, species="mouse")) } |
R
dplyr
data.table
org.Hs.eg.db
org.Mm.eg.db
magrittr
biomaRt
gplots
AnnotationDbi
gage
pathview
From
line
1
of
scripts/pathway.R
Differential Gene Expression using RNA-Seq
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | library(knitr) knitr::opts_chunk$set(echo = TRUE) library(DESeq2) library(ggplot2) library(knitr) library(clusterProfiler) library(biomaRt) library(ReactomePA) library(DOSE) library(KEGG.db) library(org.Mm.eg.db) library(org.Hs.eg.db) library(pheatmap) library(genefilter) library(RColorBrewer) library(GO.db) library(topGO) library(dplyr) library(gage) library(ggsci) |
472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 | source("https://bioconductor.org/biocLite.R") biocLite("DESeq2") ; library(DESeq2) biocLite("ggplot2") ; library(ggplot2) biocLite("clusterProfiler") ; library(clusterProfiler) biocLite("biomaRt") ; library(biomaRt) biocLite("ReactomePA") ; library(ReactomePA) biocLite("DOSE") ; library(DOSE) biocLite("KEGG.db") ; library(KEGG.db) biocLite("org.Mm.eg.db") ; library(org.Mm.eg.db) biocLite("org.Hs.eg.db") ; library(org.Hs.eg.db) biocLite("pheatmap") ; library(pheatmap) biocLite("genefilter") ; library(genefilter) biocLite("RColorBrewer") ; library(RColorBrewer) biocLite("GO.db") ; library(GO.db) biocLite("topGO") ; library(topGO) biocLite("dplyr") ; library(dplyr) biocLite("gage") ; library(gage) biocLite("ggsci") ; library(ggsci) |
data / bioconductor
org.Mm.eg.db
Genome wide annotation for Mouse: Genome wide annotation for Mouse, primarily based on mapping using Entrez Gene identifiers.