Code used for the manuscript 'Network reconstruction for trans acting genetic loci using multi-omics data and prior information' by Hawe et al., 2022 in Genome Medicine

public public 1yr ago 0 bookmarks

Snakemake pipeline: cohort study, simulation and benchmark

The snakemake workflow first defines a set of trans-QTL hotspots from available QTL results (i.e. genetic variants with 5+ trans associations). For each hotspot,

Code Snippets

  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
print("Load libraries and scripts.")
# ------------------------------------------------------------------------------
library(graph)
library(rtracklayer)
library(data.table)

source("scripts/lib.R")

# ------------------------------------------------------------------------------
print("Get snakemake params.")
# ------------------------------------------------------------------------------
fbinding_sites_remap <- snakemake@input$tfbs_remap
fbinding_sites_encode <- snakemake@input$tfbs_encode
fgene_annot <- snakemake@input$gene_annot

ftfbs_annot <- snakemake@output$tfbs_annot

# ------------------------------------------------------------------------------
print("Start processing.")
# ------------------------------------------------------------------------------
ga <- load_gene_annotation(fgene_annot)
tss <- promoters(ga, 1000, 1000)
names(tss) <- tss$SYMBOL

# ------------------------------------------------------------------------------
print("Creating TF-TSS annotation.")
# ------------------------------------------------------------------------------

#' Creates an annotation object, mapping TFBS to TSS
#'
#' Loads all available TFBS collected from public sources (Encode, Remap) and
#' overlaps those with the provided TSS.
#' Code adapted from file R/annotate-cpgs.R.
#'
#'
#' @author Johann Hawe <[email protected]>
#'
annotate_tfbs_to_tss <- function(fbinding_sites_remap,
                                 fbinding_sites_encode,
                                 tss) {
  # get the TFBS regions from remap
  tfbs = import(fbinding_sites_remap)
  ann = t(matrix(unlist(strsplit(values(tfbs)[,"name"], ".", fixed=T)), nrow=3))
  colnames(ann) = c("geo_id", "TF", "condition")
  values(tfbs) = DataFrame(name=values(tfbs)[,"name"],
                           data.frame(ann, stringsAsFactors=F))

  # we write out a table with all conditions and select the blood related ones
  conditions = t(matrix(unlist(strsplit(unique(values(tfbs)[,"name"]), ".",
                                        fixed=T)), nrow=3))
  colnames(conditions) = c("geo_id", "TF", "condition")
  conditions = conditions[order(conditions[,"condition"]),]
  conditions = conditions[,c(1,3)]
  conditions = conditions[!duplicated(paste(conditions[,1], conditions[,2])),]
  conditions = data.frame(conditions, blood.related=F)
  for (term in c("amlpz12_leukemic", "aplpz74_leukemia",
                 "bcell", "bjab", "bl41",
                 "blood", "lcl", "erythroid", "gm",
                 "hbp", "k562", "kasumi",
                 "lymphoblastoid", "mm1s", "p493",
                 "plasma", "sem", "thp1", "u937")) {
    conditions[grep(term, conditions[,2]),"blood.related"] = TRUE
  }

  # select the appropriate blood related TFBS subset
  selected = tfbs[values(tfbs)[,"condition"] %in%
                    conditions[conditions[,"blood.related"],"condition"]]

  # load the encode tfs separately
  encode = as.data.frame(fread(fbinding_sites_encode, header=F))
  encode = with(encode, GRanges(seqnames=V1, ranges=IRanges(V2 + 1, V3),
                                name=paste("ENCODE", V4, tolower(V6), sep="."),
                                geo_id="ENCODE", TF=V4,
                                condition=tolower(V6)))

  # filter blood related cell lines
  encode.lcl = encode[grep("gm", values(encode)[,"condition"])]
  values(encode.lcl)[,"condition"] = "lcl"
  encode.k562 = encode[grep("k562", values(encode)[,"condition"])]
  values(encode.k562)[,"condition"] = "k562"

  # combine remap and encode TFBS
  selected = c(selected, encode.lcl, encode.k562)

  # create an annotation matrix for the TSS
  chip = paste(values(selected)[,"TF"], values(selected)[,"condition"], sep=".")
  chip_exp = unique(chip)

  tfbs_ann = sapply(chip_exp, function(x) overlapsAny(tss,
                                                      selected[chip == x]))
  rownames(tfbs_ann) = names(tss)

  return(tfbs_ann)
}

tfbs_annot <- annotate_tfbs_to_tss(fbinding_sites_remap,
                                   fbinding_sites_encode,
                                   tss)
# ------------------------------------------------------------------------------
print("Saving results.")
# ------------------------------------------------------------------------------
saveRDS(tfbs_annot, file=ftfbs_annot)

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Prepare libraries and source scripts.")
# ------------------------------------------------------------------------------
library(pheatmap)
suppressPackageStartupMessages(library(GenomicRanges))
library(igraph)
library(graph)
library(reshape2)
source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/reg_net_utils.R")

# ------------------------------------------------------------------------------
print("Get snakemake parameters.")
# ------------------------------------------------------------------------------

#input
franges <- snakemake@input[["ranges"]]
fdata <- snakemake@input[["data"]]
fppi_db <- snakemake@input[["ppi_db"]]
fpriors <- snakemake@input[["priors"]]
fcpg_context <- snakemake@input[["cpg_context"]]
ftss_context <- snakemake@input[["tss_context"]]

# output
fout <- snakemake@output$fit
fsummary_plot <- snakemake@output$summary_file

# params
threads <- snakemake@threads

# ------------------------------------------------------------------------------
print("Load and prepare data.")
# ------------------------------------------------------------------------------
data <- readRDS(fdata)

# remove (rare) all-NA cases. This can happen due to scaling of all-zero entities,
# which can arise due to a very large number of cis-meQTLs which effects get
# removed from the CpGs during data preprocessing.
# NOTE: we could possibly handle this differently? Seems that these particular
# cpgs are highly influenced by genetic effects?
use <- apply(data,2,function(x) (sum(is.na(x)) / length(x)) < 1)
data <- data[,use]

print("Dimensions of data:")
print(dim(data))

priors <- readRDS(fpriors)

# filter for available data in priros
priors <- priors[colnames(data), colnames(data)]
ranges <- readRDS(franges)

# load PPI DB
ppi_db <- readRDS(fppi_db)

# ------------------------------------------------------------------------------
# for catching the genenet summary plots, we open the pdf connection here
# ------------------------------------------------------------------------------
pdf(fsummary_plot)

# ------------------------------------------------------------------------------
print("Infer regulatory networks.")
# ------------------------------------------------------------------------------
if(ranges$seed == "meqtl") {
  fcontext <- fcpg_context
} else {
  fcontext <- ftss_context
}

result <- infer_all_graphs(data, priors, ranges, fcontext, ppi_db,
                           threads)

dev.off()

# ------------------------------------------------------------------------------
print("All done. Saving results.")
# ------------------------------------------------------------------------------
saveRDS(file=fout, result)

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# ------------------------------------------------------------------------------
print("Prepare libraries and source scripts.")
# ------------------------------------------------------------------------------
library(pheatmap)
suppressPackageStartupMessages(library(GenomicRanges))
library(igraph)
library(graph)
library(reshape2)
source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/reg_net_utils.R")

# ------------------------------------------------------------------------------
print("Get snakemake parameters.")
# ------------------------------------------------------------------------------

#input
franges <- snakemake@input$ranges
fdata_kora <- snakemake@input$data_kora
fdata_lolipop <- snakemake@input$data_lolipop
fppi_db <- snakemake@input$ppi_db
fpriors <- snakemake@input$priors
fcpg_context <- snakemake@input$cpg_context
ftss_context <- snakemake@input$tss_context

# output
fout <- snakemake@output$fit
fsummary_plot <- snakemake@output$summary_file

# params
threads <- snakemake@threads

# ------------------------------------------------------------------------------
print("Load and prepare data.")
# ------------------------------------------------------------------------------
remove_all_na <- function(data) {
  # remove (rare) all-NA cases. This can happen due to scaling of all-zero entities,
  # which can arise due to a very large number of cis-meQTLs which effects get
  # removed from the CpGs during data preprocessing.
  # NOTE: we could possibly handle this differently? Seems that these particular
  # cpgs are highly influenced by genetic effects?
  use <- apply(data, 2, function(x)
    (sum(is.na(x)) / length(x)) < 1)
  data <- data[, use]
  data
}

data_kora <- remove_all_na(readRDS(fdata_kora))
data_lolipop <- remove_all_na(readRDS(fdata_lolipop))

print("Dimensions of KORA data:")
print(dim(data_kora))

print("Dimensions of LOLIPOP data:")
print(dim(data_lolipop))

# we only look at replication, so we only consider the nodes present in both
# cohorts (only for 4 sentinels, there is one less node in lolipop, e.g. LINC* genes)
common_nodes <-
  intersect(colnames(data_kora), colnames(data_lolipop))
data_kora <- data_kora[, common_nodes]
data_lolipop <- data_lolipop[, common_nodes]

# filter for available data in priros
priors <- readRDS(fpriors)
priors <- priors[common_nodes, common_nodes]
ranges <- readRDS(franges)

# load PPI DB
ppi_db <- readRDS(fppi_db)

if (ranges$seed == "meqtl") {
  fcontext <- fcpg_context
} else {
  fcontext <- ftss_context
}

# ------------------------------------------------------------------------------
# for catching the genenet summary plots, we open the pdf connection here
# ------------------------------------------------------------------------------
pdf(fsummary_plot)

# ------------------------------------------------------------------------------
# We add different levels of noise and infer graphs for all scenarios
print("Infer regulatory networks.")
# ------------------------------------------------------------------------------

# helper to get a noisy prior matrix
noisify_priors <- function(priors, noise_level) {
  if (noise_level == 0)
    return(priors)

  PSEUDO_PRIOR <- 1e-7
  number_edges_with_prior <-
    sum(priors[upper.tri(priors)] > PSEUDO_PRIOR)
  number_edges_without_prior <-
    sum(priors[upper.tri(priors)] == PSEUDO_PRIOR)

  number_entries_to_switch <-
    round(noise_level * number_edges_with_prior)

  # rare cases where we have more prior annotated edges than no-prior edges
  if (number_entries_to_switch > number_edges_without_prior) {
    number_entries_to_switch <- number_edges_without_prior
  }

  prior_idx <-
    sample(which(upper.tri(priors) & priors > PSEUDO_PRIOR),
           number_entries_to_switch)
  non_prior_idx <-
    sample(which(upper.tri(priors) & priors == PSEUDO_PRIOR),
           number_entries_to_switch)

  # swap idxs
  temp <- priors[prior_idx]
  priors[prior_idx] <- priors[non_prior_idx]
  priors[non_prior_idx] <- temp

  # make symmetric
  priors[lower.tri(priors)] <- t(priors)[lower.tri(priors)]

  return(priors)
}

# include 0 noise ('normal' model)
noise_levels <- seq(0, 0.8, by = 0.2)
result <- lapply(noise_levels, function(noise_level) {

  print(paste0("Current noise level: ", noise_level))

  priors <- noisify_priors(priors, noise_level)

  result_kora <-
    infer_all_graphs(data_kora, priors, ranges, fcontext, ppi_db,
                     threads)
  result_lolipop <-
    infer_all_graphs(data_lolipop, priors, ranges, fcontext, ppi_db,
                     threads)

  list(kora = result_kora, lolipop = result_lolipop, priors = priors)
})
names(result) <- paste0("noise_level_", noise_levels)
dev.off()

# ------------------------------------------------------------------------------
print("All done. Saving results.")
# ------------------------------------------------------------------------------
saveRDS(file = fout, result)

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(microbenchmark)
library(ggplot2)
library(dplyr)
library(cowplot)
theme_set(theme_cowplot() + background_grid())

benchmark_number_iterations <- snakemake@params$benchmark_number_iterations

# ------------------------------------------------------------------------------
print("Loading data")
# ------------------------------------------------------------------------------
result_files <- snakemake@input
data <- lapply(result_files, function(input_file) {
  as_tibble(readRDS(input_file))
}) %>% bind_rows

# overview plot of results

# define colors
paired <- RColorBrewer::brewer.pal(4, "Paired")
names(paired) <- c("glasso", "glasso (priors)", "bdgraph", "bdgraph (priors)")
unpaired <- RColorBrewer::brewer.pal(7, "Dark2")[c(2,3,7)]
names(unpaired) <- c("irafnet", "genie3", "genenet")
graph_cols <- c(paired, unpaired)

# we need slightly nicer names
data <- mutate(data, expr=gsub("_priors"," (priors)", expr))

gp <- ggplot(data,
             aes(x = reorder(expr, -(time)), y = log10(time), color=expr)) +
  scale_y_log10() +
  geom_boxplot() +
  facet_grid(rows = vars(sample_size), cols = vars(number_of_nodes)) + 
  scale_color_manual(values = graph_cols) +
  theme(axis.text.x = element_text(angle = -45, hjust = 0, vjust = 0)) +
  theme(panel.border = element_rect(fill = NA, size = 1, color = "lightgrey")) +
  labs(
    x = "",
    y = "log10(time in seconds)",
    title = paste0(
      "Benchmark results for ",
      benchmark_number_iterations,
      " iterations."
    ), 
    subtitle = "Columns show number of input nodes, rows sample size",
    color = "model"
  )
gp

save_plot(filename = snakemake@output$overview_plot, gp, 
          ncol = 2 , nrow = 2)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
devtools::session_info()
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(microbenchmark)
# needed to simulate data
library(BDgraph)

source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/reg_net_utils.R")
source("scripts/benchmark_methods.R")

threads <- snakemake@threads

RhpcBLASctl::omp_set_num_threads(1)
RhpcBLASctl::blas_set_num_threads(1)

simulation_number_of_nodes <-
  as.numeric(snakemake@wildcards$number_nodes)

simulation_sample_size <- 
  as.numeric(snakemake@wildcards$sample_size)

benchmark_number_iterations <-
  snakemake@params$benchmark_number_iterations

model <- snakemake@wildcards$model

# ------------------------------------------------------------------------------
print("Preparing benchmark input functions.")
# ------------------------------------------------------------------------------
benchmark_input <- list()

is_model_with_prior <-
  ifelse(model %in% c("glasso", "bdgraph"),
         TRUE, FALSE)

# only one where we can use  priors but without 'no prior' version
if (model == "irafnet") {
  benchmark_input[[model]] <-
    quote(reg_net(s$data, s$priors, model,
                  threads = threads))


} else {
  if (is_model_with_prior) {
    benchmark_input[[paste0(model, "_priors")]] <-
      quote(reg_net(s$data, s$priors, model,
                    threads = threads))

    benchmark_input[[model]] <-
      # we set 'use_gstart' in case it is bdgraph
      quote(reg_net(
        s$data,
        NULL,
        model,
        use_gstart = F,
        threads = threads
      ))

  } else {
    benchmark_input[[model]] <-
      quote(reg_net(s$data, NULL, model,
                    threads = threads))

  }
}


# ------------------------------------------------------------------------------
print("Performing benchmark.")
# ------------------------------------------------------------------------------

benchmark_results <-   microbenchmark(
  list = benchmark_input,
  times = benchmark_number_iterations,
  unit = "s",
  setup = {
    s <-
      simulate_data(simulation_number_of_nodes,
                    simulation_sample_size)
  }
)

# ------------------------------------------------------------------------------
print("Benchmark done. Finishing up.")
# ------------------------------------------------------------------------------
benchmark_results$sample_size <- simulation_sample_size
benchmark_results$number_of_nodes <- simulation_number_of_nodes

saveRDS(benchmark_results, file = snakemake@output$result_table)

# ------------------------------------------------------------------------------
print("Report warnings:")
# ------------------------------------------------------------------------------
warnings()

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
devtools::session_info()
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(plsgenomics)
library(pheatmap)
library(ggplot2)
library(reshape2)
library(cowplot)
theme_set(theme_cowplot())

source("scripts/lib.R")
source("scripts/collect_data_methods.R")

# ------------------------------------------------------------------------------
print("Get snakemake params")
# ------------------------------------------------------------------------------
# input
fcohort_data <- snakemake@input$cohort_data
ftfbs_annot <- snakemake@input$tfbs_annot

# output
fout_plot <- snakemake@output$plot
fout_tfa <- snakemake@output$tfa
fout_expr <- snakemake@output$expr

cohort <- snakemake@wildcards$cohort

# ------------------------------------------------------------------------------
print("Loading data.")
# ------------------------------------------------------------------------------
load(fcohort_data)

# get df containing only expr covariates and the expr probes
# needed for covariate removal method
expr_covars <- cbind.data.frame(expr,
                                covars[,c("age", "sex", "RIN",
                                          "batch1", "batch2")])
probe_resid <- rm_covariate_effects(expr_covars, "expr")

# we need one expression value per sample per gene -> summarize probes belonging
# to one gene
all_syms <- unique(symbols.from.probeids(colnames(probe_resid)))
symbol_resid <- summarize(probe_resid, all_syms)

# get the tfbs tss annotation
tss_annot <- readRDS(ftfbs_annot)
tfs <- unique(sapply(strsplit(colnames(tss_annot), "\\."), "[[", 1))

# one TF might have the same target measured more than once -> summarize
tss_annot_summarized <- sapply(tfs, function(tf) {
  rowSums(tss_annot[,grepl(paste0(tf, "\\."), colnames(tss_annot)), drop=F])
})

# ------------------------------------------------------------------------------
print("Define annotation and data subsets.")
# ------------------------------------------------------------------------------
# get TFs and their targets
targets <- intersect(rownames(tss_annot), colnames(symbol_resid))
tf_sub <- tfs[tfs %in% colnames(symbol_resid)]

# get the annotation and data subsets
annot_sub <- tss_annot_summarized[targets,,drop=F]
annot_sub <- annot_sub[, tf_sub]
data_sub <- t(symbol_resid[, targets])

# ------------------------------------------------------------------------------
print("Estimating TFAs using PLS/SIMPLS and substituting.")
# ------------------------------------------------------------------------------
TFA <- t(TFA.estimate(annot_sub, data_sub)$TFA)
colnames(TFA) <- colnames(annot_sub)
rownames(TFA) <- colnames(data_sub)

# substitute expression of TFs with TFA
symbol_resid_tfa <- symbol_resid
symbol_resid_tfa[,tf_sub] <- TFA[,tf_sub]

# ------------------------------------------------------------------------------
print("Saving results.")
# ------------------------------------------------------------------------------
saveRDS(file=fout_tfa, symbol_resid_tfa)
saveRDS(file=fout_expr, symbol_resid)

# ------------------------------------------------------------------------------
print("Getting correlations of TFs/Targets and plotting.")
# ------------------------------------------------------------------------------

# get plotting data frame
data <- sapply(tf_sub, function(g) { 
  cor(symbol_resid_tfa[,g], symbol_resid[,g])
})

toplot <- data.frame(correlation=unlist(data))

pdf(fout_plot)
ggplot(toplot, aes(x=correlation)) +
  geom_histogram() +
  labs(title=paste0("Correlation between TFA and Expression in ", cohort))
dev.off()

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
sink()
sink(type="message")
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")


# ------------------------------------------------------------------------------
print("Load libraries, source scripts.")
# ------------------------------------------------------------------------------
suppressPackageStartupMessages(library(GenomicRanges))
source("scripts/lib.R")

# ------------------------------------------------------------------------------
print("Get snakemake params.")
# ------------------------------------------------------------------------------

# input
franges <- snakemake@input[["ranges"]]
fkora_data <- snakemake@input[["kora"]]
flolipop_data <- snakemake@input[["lolipop"]]
fkora_activities <- snakemake@input$kora_activities
flolipop_activities <- snakemake@input$lolipop_activities
fccosmo <- snakemake@input[["ccosmo"]]
fceqtl <- snakemake@input[["ceqtl"]]

# params
cohort <- snakemake@wildcards$cohort
sentinel <- snakemake@wildcards$sentinel
seed <- snakemake@wildcards$seed
tfa_or_expr <- snakemake@params$tfa_or_expr

print(paste0("Using '", tfa_or_expr, "' for gene measurements"))

# we source a different methods script in case we get TFA
if("tfa" %in% tfa_or_expr) {
  source("scripts/collect_data_methods_tfa.R")
} else {
  source("scripts/collect_data_methods.R")
}

# output
fout <- snakemake@output[[1]]
fout_raw <- snakemake@output[[2]]

print(paste0("Sentinel is ", sentinel, "."))

# ------------------------------------------------------------------------------
print("Prepare probe and snp ids.")
# ------------------------------------------------------------------------------
ranges <- readRDS(franges)

# get expression probes
expr_probes = c(unlist(ranges$snp_genes$ids),
                unlist(ranges$cpg_genes$ids),
                unlist(ranges$tfs$ids),
                unlist(ranges$spath$ids),
                unlist(ranges$trans_genes$ids))
expr_probes <- unique(expr_probes)

# get expr gene symbols (for TFA based data)
expr_syms = c(unlist(ranges$snp_genes$SYMBOL),
                unlist(ranges$cpg_genes$SYMBOL),
                unlist(ranges$tfs$SYMBOL),
                unlist(ranges$spath$SYMBOL),
                unlist(ranges$trans_genes$SYMBOL))
expr_syms <- unique(expr_syms)

# get methylation probes
meth_probes <- names(ranges$cpgs)

# ------------------------------------------------------------------------------
print("Loading cohort data.")
# ------------------------------------------------------------------------------
if("kora" %in% cohort) {
  load(fkora_data)
  if("tfa" %in% tfa_or_expr) {
    acts <- readRDS(fkora_activities)
  }
} else if ("lolipop" %in% cohort) {
  load(flolipop_data)
  if("tfa" %in% tfa_or_expr) {
    acts <- readRDS(flolipop_activities)
  }
} else {
  stop("Cohort not supported.")
}

# ------------------------------------------------------------------------------
print("Create merged data frame.")
# ------------------------------------------------------------------------------
if("tfa" %in% tfa_or_expr) {
  print("Collecting TFA based data.")
  genes <- colnames(acts)[colnames(acts) %in% expr_syms]
  data <- cbind.data.frame(covars,
                           acts[,genes, drop=F],
                           meth[,colnames(meth) %in% meth_probes, drop=F],
                           geno[,colnames(geno) %in% sentinel, drop=F],
                           geno_ids=rownames(geno),
                           stringsAsFactors=F)
} else {
  print("Collecting EXPR based data.")
  data <- cbind.data.frame(covars,
                           expr[,colnames(expr) %in% expr_probes, drop=F],
                           meth[,colnames(meth) %in% meth_probes, drop=F],
                           geno[,colnames(geno) %in% sentinel, drop=F],
                           geno_ids=rownames(geno),
                           stringsAsFactors=F)
}

# ------------------------------------------------------------------------------
print("Saving raw data.")
# ------------------------------------------------------------------------------
saveRDS(file=fout_raw, data)

# remove not needed remaining data frames
rm(expr, meth, covars)
gc()

print(paste0("Data dimensions: ", paste(dim(data), collapse=",")))

# ------------------------------------------------------------------------------
print("Removing covariate effects from raw data.")
# ------------------------------------------------------------------------------
if("tfa" %in% tfa_or_expr) {
  data <- adjust_data(sentinel, ranges, genes, data, geno, fccosmo, fceqtl)  
} else {
  data <- adjust_data(sentinel, ranges, data, geno, fccosmo, fceqtl)
}

# ------------------------------------------------------------------------------
print("Saving adjusted data.")
# ------------------------------------------------------------------------------
saveRDS(file=fout, data)

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts.")
# ------------------------------------------------------------------------------
library(qvalue)
library(data.table)
library(graph)
library(fdrtool)
library(Homo.sapiens)
library(pheatmap)

source("scripts/lib.R")
source("scripts/priors.R")

# ------------------------------------------------------------------------------
print("Get snakemake params.")
# ------------------------------------------------------------------------------
# input
fgene_priors <- snakemake@input[["gg_priors"]]
feqtlgen_eqtl_priors <- snakemake@input[["eqtlgen_eqtl_priors"]]
fgtex_eqtl_priors <- snakemake@input[["gtex_eqtl_priors"]]
fcpgcontext <- snakemake@input[["cpg_context"]]
ftsscontext <- snakemake@input[["tss_context"]]
fppi <- snakemake@input[["ppi"]]
franges <- snakemake@input[["ranges"]]
fcpg_annot <- snakemake@input[["cpg_annot"]]

# params
sentinel <- snakemake@wildcards$sentinel
eqtl_prior_type <- snakemake@params$eqtl_prior_type

# output
fout <- snakemake@output[[1]]
fplot <- snakemake@output[[2]]

# ------------------------------------------------------------------------------
print("Loading data.")
# ------------------------------------------------------------------------------

print("Loading PPI db.")
ppi_db <- readRDS(fppi)

# load ranges object
ranges <- readRDS(franges)

# get all entities as single vector
nodes <- c(sentinel, ranges$snp_genes$SYMBOL,
           ranges$tfs$SYMBOL, ranges$spath$SYMBOL)

if(ranges$seed == "meqtl") {
  nodes <- c(nodes, with(ranges,
                         c(cpg_genes$SYMBOL, names(cpgs))))
} else {
  nodes <- c(nodes, ranges$trans_genes$SYMBOL)
}
nodes <- unique(nodes)

# ------------------------------------------------------------------------------
print("Load gtex priors, extract link priors.")
# ------------------------------------------------------------------------------
feqtl <- ifelse("eqtlgen" %in% eqtl_prior_type, 
                feqtlgen_eqtl_priors, 
                fgtex_eqtl_priors)
load_eqtl_priors(sentinel, feqtl, eqtl_prior_type)
load_genegene_priors(fgene_priors)

# set appropriate context
if(ranges$seed == "meqtl") {
  fcontext <- fcpgcontext
} else {
  fcontext <- ftsscontext
}

print("Annotation context is:")
print(fcontext)

print("Retrieving link priors.")
pr <- get_link_priors(ranges, nodes, ppi_db, fcontext, 
                      fcpg_annot)

# create a heatmap to be able to look at the priors
pheatmap(filename = fplot, pr, cex=0.7)

# ------------------------------------------------------------------------------
print("Saving data.")
# ------------------------------------------------------------------------------
saveRDS(pr, file=fout)
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load packages, source scripts.")
# ------------------------------------------------------------------------------
suppressPackageStartupMessages(library(GenomicRanges))
library(data.table)
library(graph)
source("scripts/lib.R")
source("scripts/collect_ranges_methods.R")

# ------------------------------------------------------------------------------
print("Getting snakemake params.")
# ------------------------------------------------------------------------------
# inputs
feqtl <- snakemake@input$eqtl
fppi <- snakemake@input$ppi
ftfbs_annot <- snakemake@input$tfbs_annot
fgene_annot <- snakemake@input$gene_annot

# output file
fout_ranges <- snakemake@output$ranges
fout_plot <- snakemake@output$plot

# params
sentinel <- snakemake@wildcards$sentinel
tissue <- snakemake@wildcards$tissue

# ------------------------------------------------------------------------------
print("Load and preprocess data.")
# ------------------------------------------------------------------------------

# PPIs
ppi_db <- readRDS(fppi)
ppi_genes <- nodes(ppi_db)

# gene annotation
gene_annot <- load_gene_annotation(fgene_annot)
gene_annot$ids <- probes.from.symbols(gene_annot$SYMBOL,
                                      as_list=T)

# load trans-eQTL
eqtl = fread(feqtl)
eqtl <- eqtl[eqtl$SNP == sentinel,]
if(nrow(eqtl)<5) stop("Will only process hotspots with at least 5 associations.")

# load TFBS annotation
tfbs <- readRDS(ftfbs_annot)

# ------------------------------------------------------------------------------
print("Collecting SNP and gene ranges.")
# ------------------------------------------------------------------------------

# get sentinel region + extended region in which to look for genes
spos <- eqtl[1, c("SNPChr", "SNPPos")]
if(nrow(spos)<1) {
  stop("Couldn't get SNP pos.")
}

sentinel_range <- with(spos,
                       GRanges(paste0("chr", SNPChr),
                               IRanges(SNPPos, width=1)))
sentinel_extrange <- resize(sentinel_range, 1e6, fix="center")
names(sentinel_range) <- names(sentinel_extrange) <- sentinel

# get the regions of the associated trans genes
# we dont have the end position/length of the respective genes in the eqtl
# table, so we use our own annotation
trans_genes <- eqtl$GeneSymbol
trans_genes <- gene_annot[gene_annot$SYMBOL %in% trans_genes]

# could be some are missing. in that case we very likely cannot do anything
# aboutit, since we also won't have any probe ids. if there are too few left,
# we have to abort and report an error
if(length(trans_genes) < 5) {
  stop("Too many trans genes missing in our annotation. Will not collect
       ranges.")
}

# ------------------------------------------------------------------------------
print("Retrieving SNP genes.")
# ------------------------------------------------------------------------------

# get the relevant snp genes by overlapping with our sentinel region
snp_genes <- subsetByOverlaps(promoters(gene_annot),
                               sentinel_extrange,
                               ignore.strand=T)

# ------------------------------------------------------------------------------
print("Collecting TFs and shortest path genes.")
# ------------------------------------------------------------------------------

# load all TFBS we have available in our data and connect with trans-genes
tfbs <- tfbs[rownames(tfbs) %in% trans_genes$SYMBOL,,drop=F]
tfs <- NULL
sp <- NULL
tf_sp <- NULL

tfs_by_transGene <- get_tfs_by_transGene(tfbs, trans_genes, gene_annot)
if(length(tfs_by_transGene) > 0) {
  tfs <- unique(unlist(GenomicRangesList(tfs_by_transGene), use.names=F))
}

# find the shortest path genes between the SNP genes and the annotated TFs
snp_genes_in_ppi <- snp_genes[snp_genes$SYMBOL %in% ppi_genes]

if(length(tfs)<1 | length(snp_genes_in_ppi)<1) {
  warning("No TFs, skipping shortest paths calculation.")
  if(length(tfs) > 0) {
    tf_sp <- tfs
  }
} else {
  print("Getting paths...")
  shortest_paths <- collect_shortest_path_genes(tfs$SYMBOL, trans_genes$SYMBOL,
                                    tfs_by_transGene, ppi_genes,
                                    snp_genes$SYMBOL, ppi_db, gene_annot)
  print("Done collecting paths.")
  sp <- shortest_paths$non_tf_sp
  tf_sp <- shortest_paths$tf_sp

  # adjust the mapping to the transGenes to be consistent
  tfs_by_transGene <- lapply(tfs_by_transGene, function(tf_sub) {
    tf_sub[tf_sub$SYMBOL %in% tf_sp$SYMBOL]
  })
}

# ------------------------------------------------------------------------------
print("Finalizing and saving results.")
# ------------------------------------------------------------------------------
result <-  list(sentinel = sentinel_range,
                snp_genes=snp_genes,
                trans_genes=trans_genes)

if(!is.null(sp)){
  result$spath <- sp
}
if(!is.null(tf_sp)){
  result$tfs <- tf_sp
  result$tfs_by_transGene <- tfs_by_transGene
}

# set seed name
result$seed <- "eqtl"
result$tissue <- tissue

saveRDS(file=fout_ranges, result)

# plot
plot_ranges(result, fout_plot)

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
# load packages, source scripts
# ------------------------------------------------------------------------------
library(GenomicRanges)
library(GenomicFeatures)
library(FDb.InfiniumMethylation.hg19)
library(data.table)
library(illuminaHumanv3.db)
library(rtracklayer)
library(graph)
library(RBGL) # for shortest paths
library(Matrix)
source("scripts/lib.R")
source("scripts/collect_ranges_methods.R")

# ------------------------------------------------------------------------------
# Get snakemake params
# ------------------------------------------------------------------------------
fcosmo <- snakemake@input$tcosmo
fmeqtl <- snakemake@input$meqtl
fppi_db <- snakemake@input$ppi_db
fprio_tab <- snakemake@input$priorization
fgene_annot <- snakemake@input$gene_annot

# TODO: create this file from scratch!
fcpgcontext <- snakemake@input$cpgcontext

ofile <- snakemake@output[[1]]
sentinel <- snakemake@wildcards$sentinel

# ------------------------------------------------------------------------------
# Load and preprocess data
# ------------------------------------------------------------------------------

print("Loading data.")

gene_annot <- load_gene_annotation(fgene_annot)
gene_annot$ids <- probes.from.symbols(gene_annot$SYMBOL,
                                           as_list=T)
ppi_db <- readRDS(fppi_db)

# load trans-meQTL table
trans_meQTL = read.csv(fmeqtl,
                       sep="\t", stringsAsFactors=F)

# load trans-cosmo information
cosmo <- readRDS(fcosmo)


# load priorization table
prio <- read.table(fprio_tab, sep="\t", header=T, stringsAsFactors = F)

# get trans-genes which should be used for shortest path extraction
prio <- prio[prio$sentinel == sentinel,,drop=F]
if(nrow(prio) > 0) {
  best_trans <- unique(prio$trans.gene)
} else {
  best_trans <- NULL
}

# ------------------------------------------------------------------------------
# Collect and save ranges
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
print("Collecting SNP and CpG ranges.")
# ------------------------------------------------------------------------------

pairs = which(trans_meQTL[,"sentinel.snp"] == sentinel)

# get sentinel idx
sentinel_idx <- which(cosmo$snp==sentinel)[1]

# the large interval range for the sentinel
chr <- paste0("chr", trans_meQTL[pairs,"chr.snp"][1])
start <- trans_meQTL[pairs,"interval.start.snp"][1]
end <- trans_meQTL[pairs,"interval.end.snp"][1]

sentinel_extrange <- GRanges(chr, IRanges(start,end))
sentinel_range <- with(cosmo[sentinel_idx,],
                       GRanges(paste0("chr", snp.chr),
                               IRanges(snp.pos, width=1)))
names(sentinel_range) <- names(sentinel_extrange) <- sentinel

# get cosmo subset
idxs <- get.trans.cpgs(sentinel, trans_meQTL, cosmo)

# get related genes, i.e. genes near meQTL loci (snp+cpg)

# extend cpgs
cosmosub <- cosmo[idxs,]

croi <- with(cosmosub, GRanges(paste0("chr", cpg.chr),
                               IRanges(cpg.pos,width=2)))
names(croi) <- as.character(cosmosub[,"cpg"])
croi <- unique(croi)

# extended sentinel region
sroi <- sentinel_extrange
names(sroi) <- sentinel

# ------------------------------------------------------------------------------
print("Retrieving SNP and CpG genes.")
# ------------------------------------------------------------------------------

# get the relevant snp genes by overlapping with our sentinel region
genes_sroi <- subsetByOverlaps(gene_annot, sroi, ignore.strand=T)

# get genes near our cpg regions
genes_by_cpg <- get.nearby.ranges(croi, promoters(gene_annot))
names(genes_by_cpg) <- names(croi)
# get original ranges (not promoters)
genes_by_cpg <- lapply(names(genes_by_cpg), function(cg) {
  gs <- genes_by_cpg[[cg]]
  gene_annot[gs$hit_idx]
})
names(genes_by_cpg) <- names(croi)

# get single list of all cpg genes
genes_croi <- unique(unlist(GRangesList(unlist(genes_by_cpg))))

# ------------------------------------------------------------------------------
print("Collecting TFs and shortest path genes.")
# ------------------------------------------------------------------------------

tfs <- NULL
sp <- NULL

# get cpg ids and SNP gene symbols
cpgs <- names(croi)
snp_genes <- unique(genes_sroi$SYMBOL)

# modify ppi_db to contain our CpGs

# load the cpg-tf context
tfbs_ann <- get_tfbs_context(names(croi), fcpgcontext)
cpgs_with_tfbs <- cpgs[cpgs %in% rownames(tfbs_ann[rowSums(tfbs_ann)>0,])]
snp_genes_in_string <- snp_genes[snp_genes %in% nodes(ppi_db)]

# get locus graph
locus_graph <- add.to.graphs(list(ppi_db), sentinel, snp_genes,
                             cpgs_with_tfbs, tfbs_ann)[[1]]

# get tfs connected to cpgs
tf_syms = unique(unlist(adj(locus_graph, cpgs_with_tfbs)))
print(paste0("Annotated TFs: ", paste(tf_syms, collapse=", ")))

if(length(tf_syms) < 1 | length(snp_genes_in_string) < 1) {
  warning(paste0("No TFs or none of the SNP genes are in PPI DB. ",
                 "Skipping shortest paths calculation."))
  # still, we want to keep the available TFs if there are no SNP genes 
  # within the PPI DB (would get adjusted using shortest paths below)
  if(length(snp_genes_in_string) >= 1) {
    tfs <- gene_annot[gene_annot$SYMBOL %in% tf_syms]
  }

} else {
  # the nodes we want to keep
  # in the original meQTL paper we removed KAP1 from the TF symbols
  nodeset <- c(nodes(ppi_db), tf_syms,
               snp_genes_in_string, cpgs_with_tfbs)
  locus_graph <- subGraph(intersect(nodes(locus_graph), nodeset), locus_graph)

  shortest_paths <- get_shortest_paths(cis = cpgs_with_tfbs,
                                       trans=unique(c(snp_genes_in_string,
                                                   tf_syms)),
                           snp_genes=snp_genes_in_string,
                           locus_graph,
                           tf_syms,
                           best_trans)

  non_tf_sp <- shortest_paths$non_tf_sp
  tf_sp <- shortest_paths$tf_sp

  if(length(non_tf_sp) < 1){
    warning("No shortest path genes.")
  } else {
    sp <- gene_annot[gene_annot$SYMBOL %in% non_tf_sp]
  }
  if(length(tf_sp) < 1) {
    # This should not happen -> sanity check
    stop("No TFs on shortest paths!")
  } else {
    tfs <- gene_annot[gene_annot$SYMBOL %in% tf_sp]
  }
}
print(paste0("Annotated TFs after shortest path calculations: ", 
             paste(tfs$SYMBOL, collapse=", ")))

# ------------------------------------------------------------------------------
print("Finalizing and saving results.")
# ------------------------------------------------------------------------------
result <-  list(cpgs=croi,sentinel=sentinel_range,
                sentinel_ext_range=sentinel_extrange,
                snp_genes=genes_sroi, cpg_genes=genes_croi,
                cpg_genes_by_cpg=genes_by_cpg)

if(!is.null(sp)){
  result$spath <- sp
}
if(!is.null(tfs)){
  result$tfs <- tfs
}
# set seed name
result$seed <- "meqtl"

saveRDS(file=ofile, result)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
1
2
3
4
5
6
7
print("Installing libraries to:")
.libPaths()

source("https://bioconductor.org/biocLite.R")
biocLite(c("GeneNet", "GENIE3", "glasso", "BDgraph"))

install.packages("packages/iRafNet_1.1-2.tar.gz", repos=NULL, type="source")
R From line 1 of scripts/config_R.R
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
print("Get snakemake params")
# ------------------------------------------------------------------------------
fin <- snakemake@input[[1]]
fout <- snakemake@output[[1]]

# ------------------------------------------------------------------------------
print("Load and save to new format")
# ------------------------------------------------------------------------------
load(fin)
saveRDS(tfbs.ann, fout)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(tidyverse)
source("scripts/biomaRt.R")

# ------------------------------------------------------------------------------
print("Processing.")
# ------------------------------------------------------------------------------
fgwas <- snakemake@input$gwas
fout_snp_locs <- snakemake@output$snp_locs
fout_snp_pvalues <- snakemake@output$snp_pvalues

# load GWAS data and Position data for each SNP in there
gwas <- read_tsv(fgwas)
snp_pos <- get_snpPos_biomart(gwas$MarkerName)

# merge GWAS with position data, prepare dataframe for BIM file output
gwas_with_pos <- left_join(gwas, snp_pos, by=c("MarkerName" = "snp")) %>% 
  filter(!is.na(chr)) %>% 
  mutate(filler=0, A1=toupper(Allele1), A2=toupper(Allele2)) %>% 
  dplyr::select(chr, snp=MarkerName, filler, start, A1, A2, `P-value`) %>% 
  filter(!grepl("^H", chr))

# write bim output format
write_tsv(gwas_with_pos %>% dplyr::select(-`P-value`), 
          fout_snp_locs,
          col_names=F)

# write P-value file
write_tsv(gwas_with_pos %>% dplyr::select(snp, `P-value`), 
          fout_snp_pvalues,
          col_names=T)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/reg_net_utils.R")

# ------------------------------------------------------------------------------
print("Get snakemake params.")
# ------------------------------------------------------------------------------

fdata_kora <- snakemake@input$kora
fdata_lolipop <- snakemake@input$lolipop
foutput <- snakemake@output$result

# ------------------------------------------------------------------------------
print("Process data.")
# ------------------------------------------------------------------------------

remove_all_na <- function(data) {
  # remove (rare) all-NA cases. This can happen due to scaling of all-zero entities,
  # which can arise due to a very large number of cis-meQTLs which effects get
  # removed from the CpGs during data preprocessing.
  # NOTE: we could possibly handle this differently? Seems that these particular
  # cpgs are highly influenced by genetic effects?
  use <- apply(data, 2, function(x)
    (sum(is.na(x)) / length(x)) < 1)
  data <- data[, use]
  data
}

data_kora <- remove_all_na(readRDS(fdata_kora))
data_lolipop <- remove_all_na(readRDS(fdata_lolipop))

print("Get KORA correlation graph...")
correlation_fit_kora <-
  reg_net(data_kora, NULL, "correlation")
print(correlation_fit_kora$graph)

print("Done.\nGet LOLIPOP correlation graph...")
correlation_fit_lolipop <-
  reg_net(data_lolipop, NULL, "correlation")
print(correlation_fit_lolipop$graph)
print("Done.")

result <- list(kora = correlation_fit_kora,
               lolipop = correlation_fit_lolipop)

readr::write_rds(result, foutput)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
f_cosmo <- snakemake@input[[1]]
of_trans <- snakemake@output[["trans"]]
of_cis <- snakemake@output[["cis"]]
of_lr <- snakemake@output[["longrange"]]

cat("Loading cosmo file.\n")
load(f_cosmo)

# threshold for long-range: 1MB distance between cpg and snp
cat("Defining cis/longrange/trans.\n")
cis <- which((cosmo$snp.chr == cosmo$cpg.chr) & (abs(cosmo$snp.pos - cosmo$cpg.pos)<=1e6))
cosmo.cis <- cosmo[cis,]
longrange <- which((cosmo$snp.chr == cosmo$cpg.chr) & (abs(cosmo$snp.pos - cosmo$cpg.pos)>1e6))
cosmo.longrange <- cosmo[longrange,]
trans <- which(cosmo$snp.chr != cosmo$cpg.chr)
cosmo.trans <- cosmo[trans,]
rm(cosmo)

cat("Number of cis-associations: ", length(cis), "\n")
cat("Number of longrange-associations: ", length(longrange), "\n")
cat("Number of trans-associations: ", length(trans), "\n")

cat("Saving new cosmo splits.\n")

cosmo <- cosmo.cis
saveRDS(file=of_cis, cosmo)
cosmo <- cosmo.longrange
saveRDS(file=of_lr, cosmo)
cosmo <- cosmo.trans
saveRDS(file=of_trans, cosmo)
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
print("Loading libraries and sourcing scripts.")
# ------------------------------------------------------------------------------
suppressPackageStartupMessages(library(GenomicRanges))
library(ggplot2)
source("scripts/lib.R")

cols <- set_defaultcolors()
sfm <- scale_fill_manual(values=cols)
theme_set(theme_bw())

# ------------------------------------------------------------------------------
print("Getting snakemake params.")
# ------------------------------------------------------------------------------
finputs <- snakemake@input
fout <- snakemake@output[[1]]

# ------------------------------------------------------------------------------
print("Loading data and creating data-frame for plotting.")
# ------------------------------------------------------------------------------
data <- lapply(finputs, function(fin) {
  # load data
  data <- readRDS(fin)

  # for now just report the number of identified entities
  ncol(data)
})

data <- do.call(rbind.data.frame, args=c(data, stringsAsFactors=F))
colnames(data) <- c("entities")
data$seed <- ifelse(grepl("eqtlgen", finputs), "eqtlgen", "meqtl")

# convert back to numeric..
data$entities <- as.numeric(data$entities)

# ------------------------------------------------------------------------------
print("Plotting and saving results.")
# ------------------------------------------------------------------------------
pdf(fout)
ggplot(data, aes(x=entities)) + geom_histogram(stat="count") +
  xlab("number of variables") + ggtitle("Histogram of the number of
                                                     variables over all loci
                                                     for meQTL and eQTLgen.") +
  facet_wrap(~seed, ncol=2)
dev.off()

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
print("Loading libraries and sourcing scripts.")
# ------------------------------------------------------------------------------
library(GenomicRanges)
library(ggplot2)
library(reshape2)
library(cowplot)

source("scripts/lib.R")

cols <- get_defaultcolors()
sfm <- scale_fill_manual(values=cols)
theme_set(theme_cowplot())

# ------------------------------------------------------------------------------
print("Getting snakemake params.")
# ------------------------------------------------------------------------------
# input
finputs <- snakemake@input

# check seed group: eqtl or meqtl based
seeds <- ifelse(grepl("eqtlgen", finputs), "eqtl", "meqtl")

# output
fout <- snakemake@output[[1]]

# ------------------------------------------------------------------------------
print("Loading data and creating data-frame for plotting.")
# ------------------------------------------------------------------------------
data <- lapply(finputs, function(fin) {
  # load ranges
  ranges <- readRDS(fin)

  # get sentinel name
  sentinel <- names(ranges$sentinel)

  # get number of entities
  sg <- length(ranges$snp_genes)
  tfs <- length(ranges$tfs)
  sp <- length(ranges$spath)

  seed <- ranges$seed

  if(seed == "meqtl"){
    trans_assoc <- length(ranges$cpgs)
    cpg_genes <- length(ranges$cpg_genes)
  } else {
    trans_assoc <- length(ranges$trans_genes)
    cpg_genes <- NA
  }

  # get the grand total of entities
  total <- sg + tfs + sp + trans_assoc + ifelse(is.na(cpg_genes), 0, cpg_genes)

  if(seed == "meqtl") {
    c(sentinel, sg, tfs, sp, trans_assoc, cpg_genes, total, seed)
  } else {
    c(sentinel, sg, tfs, sp, trans_assoc, 0, total, seed)
  }

})

data <- do.call(rbind.data.frame, args=c(data, stringsAsFactors=F))

colnames(data) <- c("snp","snp_genes","TFs","shortest_path",
                      "trans_entities", "cpg_genes", "total", "seed")

# convert back to numeric..
data$snp_genes <- as.numeric(data$snp_genes)
data$trans_entities <- as.numeric(data$trans_entities)
data$TFs <- as.numeric(data$TFs)
data$shortest_path <- as.numeric(data$shortest_path)
data$cpg_genes <- as.numeric(data$cpg_genes)
data$total <- as.numeric(data$total)

# get fractions as well
data_fractions <- data
data_fractions$snp_genes <- data$snp_genes / data$total
data_fractions$trans_entities <- data$trans_entities / data$total
data_fractions$TFs <- data$TFs / data$total
data_fractions$shortest_path <- data$shortest_path / data$total
data_fractions$cpg_genes <- data$cpg_genes / data$total

# remove totals
data$total <- NULL
data_fractions$total <- NULL

# melt data frames for use in ggplot
melted <- melt(data)
melted_fractions <- melt(data_fractions)

# ------------------------------------------------------------------------------
print("Plotting and saving results.")
# ------------------------------------------------------------------------------

gt <- ggtitle(paste0("Overview on number of entities gathered for\n",
                     length(finputs), " trans loci."))

fw <- facet_wrap( ~ seed, ncol=2)
vert_labels <- theme(axis.text.x = element_text(angle = 90, hjust = 1))

# violin plots containing points/lines showing distributions
gp <- ggplot(aes(y=value, x=variable, fill=variable), data=melted) +
	geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
	sfm + gt + fw + vert_labels
gp1 <- ggplot(aes(y=value, x=variable, fill=variable), data=melted_fractions) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  sfm + gt + fw + vert_labels
gp2 <- gp + geom_line(aes(group=snp)) + fw
# histograms of individual entity types
gp3 <- ggplot(aes(x=value, fill=variable), data=melted) +
  geom_histogram(stat="count") + facet_grid(seed ~ variable) + sfm

# barplot over all loci
gp4 <- ggplot(aes(y=value, x=snp, fill=variable), data=melted) +
  geom_bar(stat="identity", position="dodge") + sfm +
  vert_labels + fw

pdf(fout, width=10, height=8)
gp
gp1
gp2
gp3
gp4

# finally, plot the number of entities per locus
# TODO there must be a better way for this...
eqtl <- melted[melted$seed == "eqtl",]
meqtl <- melted[melted$seed == "meqtl",]
sum_per_locus_meqtl <- tapply(meqtl$value, meqtl$snp, sum)
sum_per_locus_meqtl <- cbind.data.frame(snp=names(sum_per_locus_meqtl),
                                       count=sum_per_locus_meqtl,
                                       stringsAsFactors=F)
sum_per_locus_eqtl <- tapply(eqtl$value, eqtl$snp, sum)
sum_per_locus_eqtl <- cbind.data.frame(snp=names(sum_per_locus_eqtl),
                                       count=sum_per_locus_eqtl,
                                       stringsAsFactors=F)

sum_per_locus <- rbind(sum_per_locus_meqtl, sum_per_locus_eqtl)
sum_per_locus$seed <- c(rep("meqtl", nrow(sum_per_locus_meqtl)),
                        rep("eqtl", nrow(sum_per_locus_eqtl)))
gp5 <- ggplot(aes(y=count, x="all loci", fill="all loci"),
              data = sum_per_locus) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  xlab("") + ylab("Number of entities") +
  ggtitle("Total number of entities for all available loci.") +
  scale_fill_manual(values=cols, guide=F) + facet_wrap( ~ seed, ncol=2)
gp5

dev.off()

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(data.table)
library(graph)
library(igraph)
library(dplyr)
source("scripts/lib.R")

# ------------------------------------------------------------------------------
print("Get snakemake params.")
# ------------------------------------------------------------------------------
fstring <- snakemake@input$stringdb
fbiogrid <- snakemake@input$biogrid
fhuri <- snakemake@input$huri

fgene_annot <- snakemake@input$gene_annot

# gtex gene expression data (we filter genes for the ones expressed in blood)
fgtex <- snakemake@input$gtex

fout <- snakemake@output[[1]]

ppi_name <- snakemake@params$ppi_name

# ------------------------------------------------------------------------------
print(paste0("Loading PPI database: ", ppi_name))
# ------------------------------------------------------------------------------

ga <- load_gene_annotation(fgene_annot)

ga_tibble <- ga %>% as.data.frame %>% as_tibble(rownames = "ENSG") %>%
  mutate(ENSG_gene = gsub("\\..*", "", ENSG)) %>%
  select(ENSG_gene, SYMBOL)

if(ppi_name == "string") {
  string.all <- fread(fstring,
                      data.table=F, header=T, stringsAsFactors=F)
  string.inter <- string.all[string.all$experimental>=1 | string.all$database>=1,]

  string.nodes <- unique(c(string.inter[,1], string.inter[,2]))
  print("Creating interaction graph.")
  ppi_db <- graphNEL(nodes=string.nodes)
  ppi_db <- addEdge(string.inter[,1],
                    string.inter[,2],
                    ppi_db)
} else if (grepl("biogrid", ppi_name)) {
  # biogrid database
  biogrid <- fread(fbiogrid, stringsAsFactors=F)
  if(grepl("stringent", ppi_name)) {
    print("Making PPIs more stringent.")
    biogrid <- biogrid %>%
      filter(grepl("Low Throughput", Throughput)) %>%
      filter(`Experimental System Type` == "physical") %>%
      as_tibble()
  } else {
    biogrid <- biogrid %>% as_tibble()
  }

  print("Creating interaction graph.")
  nodes <- unique(c(biogrid$`Official Symbol Interactor A`,
                  biogrid$`Official Symbol Interactor B`))
  ppi_db <- graphNEL(nodes=nodes)
  ppi_db <- addEdge(biogrid$`Official Symbol Interactor A`,
                    biogrid$`Official Symbol Interactor B`,
                    ppi_db)
} else if (ppi_name == "huri"){
  huri <- readr::read_tsv(fhuri, col_names = c("gene1", "gene2")) %>% 
    left_join(ga_tibble, by=c("gene1" = "ENSG_gene")) %>% 
    left_join(ga_tibble, by = c("gene2" = "ENSG_gene")) %>% 
    select(gene1 = SYMBOL.x, gene2 = SYMBOL.y) %>% 
    tidyr::drop_na()
  nodes <- unique(c(pull(huri, gene1), pull(huri, gene2)))
  ppi_db <- graphNEL(nodes = nodes)
  ppi_db <- addEdge(huri$gene1, huri$gene2, ppi_db)
} else {
  stop(paste0("PPI db not supported:", ppi_name))
}

# ------------------------------------------------------------------------------
print("Filtering PPI for expressed genes.")
# ------------------------------------------------------------------------------
ppi_db_expr <- filter_expression(fgtex, ga, ppi_db)

# ------------------------------------------------------------------------------
print("Getting largest connected component.")
# ------------------------------------------------------------------------------
ppi_db_final <- get_largest_cc(ppi_db_expr)


print("Largest CC in PPI_DB filtered for blood expressed genes:")
print(ppi_db_final)

# ------------------------------------------------------------------------------
print("All done. Saving output.")
# ------------------------------------------------------------------------------
saveRDS(ppi_db_final, fout)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# ------------------------------------------------------------------------------
# Load libraries and source scripts
# ------------------------------------------------------------------------------
library(data.table)
library(fdrtool)
source("scripts/priors.R")

# ------------------------------------------------------------------------------
# Get snakemake params
# ------------------------------------------------------------------------------

# inputs
feqtl <- snakemake@input[["eqtl"]]
dplots <- snakemake@params$plot_dir

# outputs
fout_eqtl_priors <- snakemake@output$eqtl_priors

# ------------------------------------------------------------------------------
print("Start processing.")
# ------------------------------------------------------------------------------
all_priors <- create_eqtlgen_eqtl_priors(feqtl)

# ------------------------------------------------------------------------------
print("Processing done, saving priors.")
# ------------------------------------------------------------------------------
saveRDS(all_priors, fout_eqtl_priors)

# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# ------------------------------------------------------------------------------
# Load libraries and source scripts
# ------------------------------------------------------------------------------
suppressPackageStartupMessages(library(qvalue))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(graph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(fdrtool))
suppressPackageStartupMessages(library(Homo.sapiens))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(FDb.InfiniumMethylation.hg19))
source("scripts/lib.R")
source("scripts/priors.R")

# ------------------------------------------------------------------------------
# Get snakemake params
# ------------------------------------------------------------------------------

# inputs
feqtl <- snakemake@input[["eqtl"]]
fsnpinfo <- snakemake@input[["snpinfo"]]
fexpr <- snakemake@input[["expr"]]
fsampleinfo <- snakemake@input[["sampleinfo"]]
fpheno <- snakemake@input[["pheno"]]
fppi <- snakemake@input[["ppi"]]
dplots <- snakemake@params$plot_dir

# outputs
fout_gene_priors <- snakemake@output$gene_priors
fout_eqtl_priors <- snakemake@output$eqtl_priors

# ------------------------------------------------------------------------------
# Start processing
# ------------------------------------------------------------------------------

print("Loading PPI db.")
ppi_db <- readRDS(fppi)

# simply delegate
create_priors(
  feqtl,
  fsnpinfo,
  fexpr,
  fsampleinfo,
  fpheno,
  dplots,
  ppi_db,
  fout_gene_priors,
  fout_eqtl_priors
)

if (FALSE) {
  # ------------------------------------------------------------------------------
  print("Prepare the Banovich based priors, i.e. TF-CpG priors.")
  # ------------------------------------------------------------------------------
  # methylation data
  meth <-
    fread("data/current/banovich-2017/methylation/full_matrix.txt",
          data.table = F)
  rownames(meth) <- meth$V1
  meth$V1 <- NULL
  cpgs <- features(FDb.InfiniumMethylation.hg19)
  cpgs <- cpgs[rownames(meth)]

  # expression data
  expr <-
    read.table(
      "data/current/banovich-2017/xun_lan/allTFexp.withHeader",
      header = T,
      sep = "\t",
      stringsAsFactors = F
    )

  # apparently the table contains duplicated entries, remove them
  expr <- expr[!duplicated(expr), ]
  rownames(expr) <- unique(expr[, 1])

  samples <- intersect(colnames(expr), colnames(meth))

  expr <- t(expr[, samples])
  meth <- t(meth[, samples])

  # ------------------------------------------------------------------------------
  print("Get (our) chip-seq context for the cpgs.")
  # ------------------------------------------------------------------------------
  tfbs_ann <- get_tfbs_context(names(cpgs), fcpgcontext)

  # ------------------------------------------------------------------------------
  print("For each TF, get the correlation to each of the CpGs it is bound nearby")
  # ------------------------------------------------------------------------------
  pairs <- lapply(colnames(expr), function(tf) {
    # get columns for tf
    sub <-
      tfbs_ann[, grepl(tf, colnames(tfbs_ann), ignore.case = T), drop = F]
    rs <- rowSums(sub)
    bound_cpgs <- names(rs[rs > 0])

    assoc <- unlist(mclapply(bound_cpgs, function(c) {
      cor.test(expr[, tf],
               meth[, c],
               method = "pearson")$p.value
    }, mc.cores = threads))

    cbind.data.frame(
      TF = rep(tf, length(assoc)),
      CpG = bound_cpgs,
      rho = assoc,
      stringsAsFactors = F
    )
  })

  # ------------------------------------------------------------------------------
  print("Collect and finalize results.")
  # ------------------------------------------------------------------------------
  tab <- do.call(rbind, pairs)
  colnames(tab) <- c("TF", "CpG", "pval")
  tab$qval <- qvalue(tab$pval)$lfdr
  tab$prior <- 1 - tab$qval
  head(tab)
  write.table(
    file = "results/current/tf-cpg-prior.txt",
    sep = "\t",
    col.names = NA,
    row.names = T,
    quote = F,
    tab
  )
}
# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

#------------------------------------------------------------------------------
print("Loading libraries and scripts.")
#------------------------------------------------------------------------------
library(graph)
library(igraph)
source("scripts/reg_net.R")
source("scripts/lib.R")
source("scripts/reg_net_utils.R")

#------------------------------------------------------------------------------
print("Getting snakemake params.")
#------------------------------------------------------------------------------

# get in and output
ffit_kora <- snakemake@input$new_kora
ffit_lolipop <- snakemake@input$new_lolipop

franges <- snakemake@input$ranges
fppi_db <- snakemake@input$ppi_db
fcpg_context <- snakemake@input$cpg_context
ftss_context <- snakemake@input$tss_context

# the output dot file and the combined graph
fout_dot <- snakemake@output$dot
fout_graph <- snakemake@output$graph

# get wildcards
graph_type <- snakemake@wildcards$graph_type
sentinel <- snakemake@wildcards$sentinel
seed <- snakemake@wildcards$seed

# use CpG context for meQTLs only
if(seed == "meqtl") {
  fcontext <- fcpg_context
} else {
  fcontext <- ftss_context
}

# define available graph types
gtypes <- c("bdgraph", "bdgraph_no_priors", "genenet", "irafnet",
            "glasso", "glasso_no_priors", "genie3")
if(!graph_type %in% gtypes) {
  stop(paste0("Graph type not supported: ", graph_type))
}
print("Using graph type:")
print(graph_type)

#-------------------------------------------------------------------------------
print("Loading data.")
#-------------------------------------------------------------------------------
ranges <- readRDS(franges)
ppi_db <- readRDS(fppi_db)

# we regenerated fits for glasso and genie3 without calculating all others
# we use 'old fits' for all other models
#if(graph_type %in% c("glasso", "glasso_no_priors", "genie3")) {
  fit_kora <- readRDS(ffit_kora)
  fit_lolipop <- readRDS(ffit_lolipop)
#} else {
#  fit_kora <- readRDS(ffit_kora_old)
#  fit_lolipop <- readRDS(ffit_lolipop_old)
#}
if(!graph_type %in% names(fit_kora) | !graph_type %in% names(fit_lolipop)) {
  stop("Graph type not available in fitting results.")
}

g_kora <- fit_kora[[graph_type]]
g_lolipop <- fit_lolipop[[graph_type]]

print("Loaded graphs:")
print("KORA:")
print(g_kora)
print("LOLIPOP:")
print(g_lolipop)

# ------------------------------------------------------------------------------
print("Combining individual cohort graphs.")
# ------------------------------------------------------------------------------
g_combined <- combine_graphs(g_kora, g_lolipop)

# filter for >0 node degrees
ds <- graph::degree(g_combined)
use <- names(ds[ds>0])
final_graph <- subGraph(use, g_combined)
final_graph_annotated <- annotate.graph(final_graph, ranges, ppi_db,
                                        fcontext = fcontext)

print("Final graph:")
print(final_graph_annotated)

#------------------------------------------------------------------------------
print("Saving output files.")
#------------------------------------------------------------------------------
attrs <- plot_ggm(final_graph_annotated, sentinel,
                  graph.title=paste0(c(sentinel, graph_type),
                                                  collapse="|"),
                  plot.on.device=F, dot.out=fout_dot)
saveRDS(file = fout_graph, final_graph_annotated)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

#------------------------------------------------------------------------------
print("Loading libraries and scripts.")
#------------------------------------------------------------------------------
library(graph)
source("scripts/lib.R")

#------------------------------------------------------------------------------
print("Getting snakemake params.")
#------------------------------------------------------------------------------

# get in and output
ffit <- snakemake@input$fits
#ffit_old <- snakemake@input$old

fout <- snakemake@output[[1]]

# get wildcards
graph_type <- snakemake@wildcards$graph_type
sentinel <- snakemake@wildcards$sentinel
cohort <- snakemake@wildcards$cohort
if(is.null(cohort)) cohort <- "GTEx"

# define available graph types
gtypes <- c("bdgraph", "bdgraph_no_priors", "genenet", "irafnet", 
            "glasso", "glasso_no_priors", "genie3")

if(!graph_type %in% gtypes) {
  stop(paste0("Graph type not supported: ", graph_type))
}
print("Using graph type:")
print(graph_type)

#------------------------------------------------------------------------------
print("Loading data.")
#------------------------------------------------------------------------------
#if(graph_type %in% c("glasso", "glasso_no_priors", "genie3")) {
  fits <- readRDS(ffit)
#} else {
#  fits <- readRDS(ffit_old)
#}
g <- fits[[graph_type]]

print("Loaded graph:")
print(g)

#------------------------------------------------------------------------------
print("Creating the dot file.")
#------------------------------------------------------------------------------
attrs <- plot_ggm(g, sentinel, graph.title=paste0(c(sentinel, cohort, graph_type), 
                                                  collapse="|"), 
                  plot.on.device=F, dot.out=fout)
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(GenomicRanges)
source("scripts/lib.R")
source("scripts/mediation_methods.R")

# ------------------------------------------------------------------------------
print("Get snakemake params")
# ------------------------------------------------------------------------------
# inputs
fdata <- snakemake@input$data
franges <- snakemake@input$ranges

# params
sentinel <- snakemake@wildcards$sentinel

# outputs
fbetas_per_gene_plot <- snakemake@output$betas_per_gene
fbeta_table <- snakemake@output$beta_table
fout <- snakemake@output$mediation

# ------------------------------------------------------------------------------
print("Loading and preparing data.")
# ------------------------------------------------------------------------------
data <- readRDS(fdata)
# remove (rare) all-NA cases. This can happen due to scaling of all-zero entities,
# which can arise due to a very large number of cis-meQTLs which effects get
# removed from the CpGs during data preprocessing.
# NOTE: we could possibly handle this differently? Seems that these particular
# cpgs are highly influenced by genetic effects?
use <- apply(data,2,function(x) (sum(is.na(x)) / length(x)) < 1)
data <- data[,use]

ranges <- readRDS(franges)

# the snp genes
sgenes <- ranges$snp_genes$SYMBOL
sgenes <- sgenes[sgenes %in% colnames(data)]
if(length(sgenes) == 0) {
  warning("No SNP genes in data matrix.")
  saveRDS(file=fout, NULL)
  q()
}
# the trans associated entities
if(ranges$seed == "meqtl") {
  ta <- names(ranges$cpgs)
} else {
  ta <- ranges$trans_genes$SYMBOL
}
ta <- ta[ta %in% colnames(data)]
print("Number of trans entities:")
print(length(ta))

# ------------------------------------------------------------------------------
print("Performing mediation analysis.")
# ------------------------------------------------------------------------------
med <- mediation(data, sentinel, sgenes, ta,
                  fbeta_table, fbetas_per_gene_plot)

# ------------------------------------------------------------------------------
print("Saving results.")
# ------------------------------------------------------------------------------
saveRDS(file=fout, med)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
sink()
sink(type="message")
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
library(ggplot2)
library(reshape2)
library(scales)
library(knitr)
library(gridExtra)
library(graph)
library(cowplot)
library(patchwork)
library(data.table)

source("scripts/lib.R")
cols <- set_defaultcolors()

# prepare some ggplot stuff
sfm <- scale_fill_manual(values=cols)
#theme_set(theme_bw())

# ------------------------------------------------------------------------------
# Get snakemake params if available
# ------------------------------------------------------------------------------

# inputs
fsummary <- snakemake@input[[1]]

# outputs
fstats <- snakemake@output$stats
fcratios <- snakemake@output$cratios
fexpr <- snakemake@output$expr
fgene_types <- snakemake@output$gene_types
fmediation <- snakemake@output$mediation
fmediation_perc <- snakemake@output$mediation_perc
fmediation_distr <- snakemake@output$mediation_distr
fperf <- snakemake@output$perf

# params
seed <- snakemake@wildcards$seed
isMEQTL <- seed == "meqtl"

# ------------------------------------------------------------------------------
# load the large result table for all loci
# TODO we currently remove the results for the GO enrichment
# ------------------------------------------------------------------------------
tab <- fread(fsummary)

# ------------------------------------------------------------------------------
# create some basic plots of the gene counts
# ------------------------------------------------------------------------------
toplot <- tab[,c("sentinel", "cohort", "graph_type",
                 "number_nodes", "number_edges",
                 "graph_density", "cluster", "cluster_sizes",
                 "snp_cluster",
                 "snp_genes", "snp_genes_selected",
                 "cpg_genes", "cpg_genes_selected",
                 "tfs", "tfs_selected",
                 "spath", "spath_selected")]
toplot$snp_in_network <- !is.na(toplot[,"snp_cluster"])

# whether the SNP has been selected or not
gp1 <- ggplot(data=toplot, aes(snp_in_network)) + geom_histogram(stat="count") +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Number of networks in which the SNP has been selected at all")

# show the distribution of number of nodes per network
gp2 <- ggplot(data=toplot, aes(number_nodes)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Distribution of the number of nodes in the networks")

# show the distribution of number of edges per network
gp3 <- ggplot(data=toplot, aes(number_edges)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Distribution of the number of edges in the graph")

# show the distribution of graph_densities network
gp4 <- ggplot(data=toplot, aes(graph_density)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Distribution of graph densities over all networks", "density= 2*|E| / |V|*(|V|-1)")

# show the distribution of resulting clusters (number of clusters, largest cluster size)
cluster_ratios <- c()
for(i in 1:nrow(toplot)) {
  # get cluster ratio
  cluster_sizes <- unlist(toplot[i,"cluster_sizes"])
  largest_cluster <- sort(as.numeric(strsplit(cluster_sizes, ",")[[1]]), decreasing = T)[1]
  cluster_ratios <- c(cluster_ratios,
                      largest_cluster/toplot[i,"number_nodes"])
}
toplot$cluster_ratio <- unlist(cluster_ratios)

# summarize the first four plots in one file
ggsave(plot=(gp1 + gp2) / (gp3 + gp4),
       file=fstats,
       width=12,
       height=8)

# save the cluster ratios plot individually
gp <- ggplot(data=toplot, aes(cluster_ratio)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Ratio of the amount of nodes in the largest clusters vs all nodes in the network")
ggsave(plot=gp,
       file=fcratios,
       width=10, height=10)

# ------------------------------------------------------------------------------
# Check how many genes are retained in the graphs for each gene_type
# ------------------------------------------------------------------------------

# get ratios
toplot$snp_gene_ratio <- toplot$snp_genes_selected / toplot$snp_genes
toplot$cpg_gene_ratio <- toplot$cpg_genes_selected / toplot$cpg_genes
toplot$tf_ratio <- toplot$tfs_selected / toplot$tfs
toplot$spath_ratio <- toplot$spath_selected / toplot$spath

# ------------------------------------------------------------------------------
# Plot everything
# ------------------------------------------------------------------------------

# snp gene ratio
use <- toplot$snp_genes_selected>0
ggp1 <- ggplot(data=toplot[use,,drop=F], aes(snp_gene_ratio)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Ratio of number of selected SNP-genes vs all SNP-genes")

if(isMEQTL) {
  # cpg gene ratio
  use <- toplot$cpg_genes_selected>0
  ggp2 <- ggplot(data=toplot[use,,drop=F], aes(cpg_gene_ratio)) + geom_histogram() +
    facet_grid(cohort ~ graph_type) + sfm +
    ggtitle("Ratio of number of selected CpG-genes vs all CpG-genes")
} else {
  ggp2 <- plot_spacer()
}

# tf ratio
use <- toplot$tfs_selected>0
ggp3 <- ggplot(data=toplot[use,,drop=F], aes(tf_ratio)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) + sfm +
  ggtitle("Ratio of number of selected TFs vs all TFs")
# spath ratio
use <- toplot$spath_selected>0
ggp4 <- ggplot(data=toplot[use,,drop=F], aes(spath_ratio)) + geom_histogram() +
  facet_grid(cohort ~ graph_type) +sfm +
  ggtitle("Ratio of number of selected shortest path genes vs all shortest path genes")

# arrange and plot
ggsave(plot=(ggp1 + ggp2) / (ggp3 + ggp4),
       file=fgene_types,
       width=12, height=8)

#Above we see a simple overview over the selected entities (snp genes, cpg genes,
#transcription factors and shortest path genes).
#Shown is the log10-ratio of the number of entities selected by the GGM and the total
#number of entities in the network. Therefore, low values indicate that comparatively more genes
#were dropped during the inference process and only fewer were selected for the final network.

#Below the mediation results on the different cohorts for all hotspots
#in which at least one SNP gene was extracted by our algorithm are shown.
#The *mediation_selected* group shows the significance of the mediation analysis of
#those selected genes, the *mediation* group shows the significance of all NOT selected genes.
#Note, that for the 'mediation_selected' group, we always take the largest obtained p-value, whereas
#for the NOT selected group we always select the lowest obtained p-value as of now.

# ------------------------------------------------------------------------------
# Mediation summary plots
# ------------------------------------------------------------------------------
mediation <- tab[,c("sentinel", "cohort", "graph_type",
                    "mediation_min_pval_notselected", "mediation_max_pval_selected","log10_mediation_NSoverS_ratio")]
# for now we ignore results where we didn't have SNP genes at all
mediation <- mediation[!is.infinite(mediation$log10_mediation_NSoverS_ratio), ,drop=F]

# plot the mediation results
toplot <- mediation[order(mediation$mediation_max_pval_selected),]
toplot$sentinel <- factor(toplot$sentinel, levels=unique(toplot$sentinel))
toplot <- melt(toplot, measure.vars=c(4,5), value.name = "pval")

gp1 <- ggplot(data=toplot, aes(x=graph_type, y=-log10(pval), fill=variable)) +
  geom_boxplot(outlier.color = NA) +
  facet_grid(cohort ~ .) + sfm +
  geom_hline(yintercept = -log10(0.05), color="red") +
  ggtitle("Mediation results of all SNP genes over all loci.")

# in addition we plot the log fold-change
# sort by log_fc
toplot <- mediation[order(mediation$log10_mediation_NSoverS_ratio, decreasing = T),]
toplot$sentinel <- factor(toplot$sentinel, levels=unique(toplot$sentinel))

toplot <- melt(toplot, measure.vars=6, value.name="log_fc")
toplot <- toplot[toplot$log_fc != 0,]
toplot$favored <- factor(sign(toplot$log_fc),labels = c("not selected", "selected"))
gp2 <- ggplot(toplot, aes(x=sentinel, y=log_fc, fill=favored)) +
  geom_bar(stat="identity") + sfm +
  facet_grid(cohort ~ graph_type) +
  theme(axis.text.x =
          element_text(angle = 90, hjust = 1, vjust=0)) +
  ylab("log10(all / selected)") +
  ggtitle("Mediation results of all SNP genes, log-foldchanges.")

# get the two main mediation plots
gp1 <- ggplotGrob(gp1)
gp2 <- ggplotGrob(gp2)

# Cross cohort mediation summary plots
mediation <- tab[,c("sentinel", "cohort", "graph_type",
                    "mediation_cross_cohort_correlation", "mediation_cross_cohort_fraction",
		    "mediation_cross_cohort_fraction_validation_significant")]
# only choose one graph_type, since the information is redundant
df <- subset(mediation, graph_type="graph")
colnames(df) <- c("sentinel", "cohort", "graph_type", "corr",
		  "fraction", "fraction_validation")
df <- df[,c("sentinel", "cohort", "corr",
                  "fraction", "fraction_validation")]
df <- melt(df, measure.vars=c(3,4,5))
df$type <- ifelse(df$variable=="corr", "correlation", "fraction")

# plot
gv <- geom_violin(draw_quantiles=c(.25,.5,.75))
gp <- ggplot(data=df, aes(y=value, x=variable))
gp3 <- gp + gv + facet_grid(cohort ~ type) + sfm +
	ggtitle("Cross cohort evaluation of mediation values")
gp3 <- ggplotGrob(gp3)

ggsave(plot=grid.arrange(gp1, gp2, gp3, nrow=3),
       file=fmediation,
       width=8, height=20)

#This figure shows a different summary of the mediation results. Shown is the log10 fold change
#of the mediation p-value for the not selected SNP genes ($p_n$) over the p-value for selected
#genes ($p_s$). The red bars (*not selected*) indicate fold changes where $p_n$ is lower
#than the corresponding $p_s$, whereas the blue bars indicate the opposite. Overall,
#**`r format(perc*100,digits=4)`\%** of all fold changes show negative fold changes
#(i.e. $p_s$ being smaller than their respective $p_n$s). Currently, we select the
#minimal p-value obtained from the genes not selected via a GGM and the maximal p-value
#from the selected genes which is a rather conservative estimate. We could think about
#changing the way of summarizing the p-values...

# ------------------------------------------------------------------------------
# Mediation details
# Here we look at the percentage of significant mediation results of the GGM selected
# SNP genes versus total number of SNP genes. We also check whether we detected
# the single best mediation gene via the models
# ------------------------------------------------------------------------------

perc <- table(toplot$favored, toplot$graph_type)
perc <- perc["selected",] / colSums(perc)
print("Percentage of selected vs not selected mediating genes:")
print(perc)

# create data frame with all needed values
results <- tab[,c("graph_type", "cohort", "snp_genes", "snp_genes_selected",
		  "snp_genes_selected.list",
		  "mediation_best_gene", "mediation_total",
                  "mediation_notselected_significant", "mediation_selected_significant")]

# ------------------------------------------------------------------------------
# Here we prepare the information of whether the best mediating gene
# (according to the correlation value of the betas) was selected by the models
# ------------------------------------------------------------------------------
results$identified_mediator <- NA
for(i in 1:nrow(results)) {
  lab <- ifelse(grepl(results[i,"mediation_best_gene"],
		      results[i,"snp_genes_selected.list"]),
		"identified",
		"missing")
  results[i,"identified_mediator"] <- lab
}

# ------------------------------------------------------------------------------
# Here we visualize the distributions over all loci for the individual groups
# and other details concerning mediation
# ------------------------------------------------------------------------------

# get SNP gene percentages regarding mediation
results$sign_selected <- results[,"mediation_selected_significant"] / results[,"snp_genes_selected"]
results$snp_genes_notselected <- results[,"snp_genes"] - results[,"snp_genes_selected"]
results$sign_notselected <- results[,"mediation_notselected_significant"] / results[,"snp_genes_notselected"]

fg <- facet_grid(. ~ cohort)
sc <- scale_y_continuous(limits=c(0,1))
gg <- ggplot(data=results, aes(y=sign_selected, x=graph_type, fill=graph_type))

# plot how often we identified the mediator
gp <- gg + geom_bar(stat="count", inherit.aes=F, aes(x=identified_mediator)) +
	sfm + facet_grid(graph_type ~ cohort) +
	ggtitle("Number of times best mediator was selected in models." )

# plot distribution  of selected SNP genes showing mediation
gp1 <- gg + geom_violin(draw_quantiles=c(.25,.5,.75)) +
			sfm + fg + sc +
			ggtitle("Percentage of selected SNP-genes showing mediation.")

# plot distribution  of not-selected SNP genes showing mediation
gp2 <- gg + aes(y=sign_notselected) +
       geom_violin(draw_quantiles=c(.25,.5,.75)) +
       sfm + fg + sc +
       ggtitle("Percentage of not-selected SNP-genes showing mediation.")

# plot distribution of percentage of total number of mediating genes
gp3 <- gg +
       geom_violin(inherit.aes=F, mapping=aes(y=mediation_total/snp_genes, x=cohort, fill=cohort), draw_quantiles=c(.25,.5,.75)) +
       sfm + sc +
       ggtitle("Percentage of SNP-genes showing mediation.")

# plot distribution  of total number of mediating genes
gp4 <- gg +
       geom_violin(inherit.aes=F, mapping=aes(y=mediation_total, x=cohort, fill=cohort), draw_quantiles=c(.25,.5,.75)) +
       sfm +
       ggtitle("Total number of SNP-genes showing mediation.")

ga <- grid.arrange(gp,gp1,gp2,gp3,gp4,ncol=2)
ggsave(ga, file=fmediation_distr, width=12, height=9)

# not needed anymore - keep only relevant subset
results <- results[,c("graph_type", "cohort", "mediation_selected_significant", "snp_genes_selected",
		      "mediation_notselected_significant","snp_genes_notselected")]
# handle different graph types separately
results <- split(results, f = results$graph_type)

temp <- lapply(names(results), function(n) {
  rsub <- results[[n]]
  cohort <- rsub[,2]

  summary <- lapply(unique(cohort), function(co) {
    # summarize results per cohort
    r <- colSums(rsub[rsub$cohort==co,c(-1, -2)], na.rm=T)

    # proportion of significant genes per group
    sign.selected <- r["mediation_selected_significant"] / r["snp_genes_selected"]
    sign.notselected <- r["mediation_notselected_significant"] / r["snp_genes_notselected"]

    # number of
    toplot <- data.frame(selected=c(1,sign.selected),
                         not.selected=c(1,sign.notselected),
			 cohort=c(co,co))
    rownames(toplot) <- c("total tested", "proportion significant")

    toplot <- melt(cbind(toplot, mediation = rownames(toplot)),
                   id.vars = c('mediation', 'cohort'))
    return(toplot)
  })
  toplot <- do.call(rbind, summary)
  p <- ggplot(toplot, aes(x=variable,y=value, fill=mediation)) +
    geom_bar(position="identity", stat="identity") +
    facet_grid(. ~ cohort) + sfm +
    theme(axis.text.x =
             element_text(angle = 90, hjust = 1, vjust=0)) +
    ylab("percentage of genes") +
    xlab("mediation group") +
    scale_y_continuous(labels=percent_format()) +
    ggtitle(n)
  return(p)
})
theme_update(plot.title = element_text(hjust = 0.5))
ga <- grid.arrange(temp[[1]], temp[[2]], temp[[3]], ncol=3)
ggsave(plot=ga, file=fmediation_perc,
       width=12, height=8)

# ------------------------------------------------------------------------------
# Specificity and sensitivity of gene selection over all sentinels
# Here we look at all sentinels and summarize the selected/not selected SNP
# genes and their mediation values by calculating all TPs, TNs, FPs, and FNs.
# We do this for the individual cohorts as well as summed up over both
# ------------------------------------------------------------------------------
cohorts <- c("kora", "lolipop", "both")
graph_types <- unique(tab$graph_type)
values <- lapply(cohorts, function(cohort) {
  temp <- lapply(graph_types, function(graph_type) {
    if(cohort=="both") {
      cohort2 <- c("kora", "lolipop")
    } else {
      cohort2 <- cohort
    }
    dat <- tab[tab$cohort %in% cohort2 & tab$graph_type == graph_type,
               c("sentinel", "snp_genes", "snp_genes_selected",
                 "mediation_notselected_significant", "mediation_selected_significant")]
    result <- sapply(1:nrow(dat), function(i) {
      d <- dat[i,,drop=F]
      v1 <- d$mediation_notselected_significant==0 & d$snp_genes_selected == 0
      v2 <- d$mediation_notselected_significant==0 & d$snp_genes_selected > 0
      v3 <- d$mediation_notselected_significant>0 & d$snp_genes_selected == 0
      v4 <- d$mediation_selected_significant > 0
      # todo
      v5 <- ""
      return(c(v1,v2,v3,v4))
    })

    result <- matrix(unlist(result), byrow = T, ncol=4)
    result <- colSums(result)
    names(result) <- c("TN", "FP", "FN", "TP")
    result
  })
  names(temp) <- paste(cohort, graph_types, sep=".")
  do.call(rbind, temp)
})
names(values) <- cohorts
values <- as.data.frame(do.call(rbind, values))
values$cohort <- unlist(lapply(strsplit(rownames(values), "\\."), "[[", 1))
values$graph_type <- unlist(lapply(strsplit(rownames(values), "\\."), "[[", 2))

# define performance summary methods
sens <- function(d) { d[,"TP"] / (d[,"TP"] + d[,"FN"]) }
spec <- function(d) { d[,"TN"] / (d[,"TN"] + d[,"FP"]) }
prec <- function(d) { d[,"TP"] / (d[,"TP"] + d[,"FP"]) }
f1 <- function(d) { 2 * 1 / ((1/sens(d)) +
			     (1/prec(d))) }
# ------------------------------------------------------------------------------
# save a simple plot showing the performance values
# ------------------------------------------------------------------------------
df <- values
df$sensitivity <- sens(df)
df$specificity <- spec(df)
df$f1 <- f1(df)

df <- melt(df, measure.vars=c(7,8,9), value.name="performance")
gp1 <- ggplot(data=df, aes(y=performance, x=graph_type, fill=graph_type)) +
	geom_bar(stat="identity") +
	facet_grid(cohort ~ variable) + scale_y_continuous(limits=c(0,1)) +
        sfm + theme(axis.text.x = element_text(vjust=1, angle = 90)) +
	ggtitle("Performance of GGMs on different cohorts",
		"Baseline defined via significant mediation genes. Summary over all sentinels.")

# ------------------------------------------------------------------------------
# Evaluate the individual methods as of how well they reproduce across cohorts
# using MCC
# ------------------------------------------------------------------------------
df <- tab[, c("sentinel", "cohort", "graph_type", "cross_cohort_mcc")]
df <- melt(df, measure.vars=4, value.name="performance")
# show the distribution of MCC values
gp2 <- ggplot(data=df, aes(y=performance, x=graph_type, fill=graph_type)) +
	geom_violin(draw_quantiles=c(.25,.5,.75)) +
	facet_grid(. ~ cohort) +
	sfm + theme(axis.text.x = element_text(vjust=1, angle = 90)) +
	ggtitle("Performance of methods regarding replication across cohorts (MCC).",
		"Only matched nodes in both graphs have been allowed for analysis.")

# show the MCC values w.r.t. to the fraction of retained nodes in the two compared models
df <- tab[, c("sentinel", "cohort", "graph_type", "cross_cohort_mcc", "cross_cohort_mcc_frac")]
#df <- melt(df, measure.vars=4, value.name="performance")
gp3 <- ggplot(data=df, aes(y=cross_cohort_mcc, x=cross_cohort_mcc_frac,
			   color=graph_type,
			   shape=graph_type)) +
        geom_point() +
        facet_grid(cohort ~ .) +
        scale_color_manual(values=cols) + geom_abline(linetype="dotted") +
        geom_hline(linetype="dotted",yintercept=0) + geom_vline(linetype="dotted",xintercept=0) +
        ggtitle("Performance of methods regarding replication across cohorts (MCC).",
                "Shown is the MCC w.r.t. the fraction of nodes retained in the graphs regarding the total number
of nodes available in the data.")

# finalize performance plot and save
ga <- grid.arrange(gp1, gp2, gp3, ncol=2)
ggsave(plot=ga, file=fperf, width=12, height=8)

# ------------------------------------------------------------------------------
# CpG-Gene validation
# ------------------------------------------------------------------------------
#TODO
#hist(tab$bonder_cis_eQTM)

# ------------------------------------------------------------------------------
# Gene-Gene validation
#
# The gene-gene links were validated using external expression data from GEO (ARCHS4),
# as well as the data from either LOLIPOP or KORA, depending on which cohort
# the bGGM was calculated.

# To assess how well our approach recovers co-expression of genes in the independent
# datasets, we calculate all gene-vs-gene (gvg) correlations in those data and then
# obtained the MCC between these networks vs. the ones obtained from the models

# We deem a correlation between two genes to be significant if 1) $p-value < 0.01$
# and 2) $\rho > 0.3$ ($\rho$ being the Pearson Correlation Coefficient).
# ------------------------------------------------------------------------------

# get the relevant data (pvalues on the different datasets)
toplot <- tab[,c("sentinel","cohort", "graph_type",
             "geo_gene_gene","cohort_gene_gene")]
colnames(toplot) <- c("sentinel", "cohort", "graph_type",
                      "GEO", "LOLIPOP_KORA")

# plot the data
df <- melt(toplot,measure.vars=c(4,5), id.vars=c(1,2,3))
gp <- ggplot(df, aes(x=graph_type, y=value, fill=variable)) +
  facet_grid(cohort~variable) + sfm +
  geom_violin(draw_quantiles=c(.25,.5,.75)) +
  theme(axis.text.x = element_text(vjust=1, angle = 90)) +
  xlab("sentinel") + ylab("MCC") +
  ggtitle("Validation of gene networks.",
	  "Shown is the MCC over all loci calculated on gene-networks extracted from models against
the networks obtained from a simple correlation analysis.")

ggsave(plot=gp,
       file=fexpr,
       width=12, height=8)
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
library(magrittr)

fresults <- snakemake@input$results

result <- lapply(fresults, function(f) {
  res <- readr::read_tsv(f) %>%
    dplyr::select(snp, rdegree, comparison, dplyr::everything())

  if (grepl("subset", f)) {
    s <- gsub(".*subset([0-9]+).*", "\\1", f)
    res <- dplyr::mutate(res,
                         subset = s)
  }
  res

}) %>% dplyr::bind_rows()

readr::write_tsv(result, snakemake@output$combined)
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")

library(GenomicRanges)
library(pheatmap)
library(doParallel)
library(igraph)
library(graph)

source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/simulation/lib.R")

# ------------------------------------------------------------------------------
print("Get snakemake input and load data.")

# inputs
fdata <- snakemake@input$data
fppi_db <- snakemake@input$ppi_db
fcpg_context <- snakemake@input$cpg_context

# outputs
fout <- snakemake@output[[1]]

# params
threads <- snakemake@threads
sim_iter <- as.numeric(snakemake@params$iteration)

# contains: simulations, ranges, priors, nodes, data, runs
load(fdata)
ppi_db <- readRDS(fppi_db)

# ------------------------------------------------------------------------------
# we generated several graphs, for which we all calculate models now

# apply over the different runs/iterations
run <- simulations[[sim_iter]]
result <- run_ggm_prior_completeness(run, priors, ranges, 
                                     fcpg_context, ppi_db, threads)

print("Saving results.")
save(file=fout, result)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
sink()
sink(type="message")
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")

library(GenomicRanges)
library(pheatmap)
library(doParallel)
library(igraph)
library(graph)

source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/simulation/lib.R")

# ------------------------------------------------------------------------------
print("Get snakemake input and load data.")

# inputs
fdata <- snakemake@input$data
fppi_db <- snakemake@input$ppi_db
fcpg_context <- snakemake@input$cpg_context

# outputs
fout <- snakemake@output[[1]]

# params
threads <- snakemake@threads
sim_iter <- as.numeric(snakemake@params$iteration)
subset <- snakemake@wildcards$subset

# contains: simulations, ranges, priors, nodes, data, runs
load(fdata)
ppi_db <- readRDS(fppi_db)

# ------------------------------------------------------------------------------
# we generated several graphs, for which we all calculate models now

# apply over the different runs/iterations
run <- simulations[[sim_iter]]

# for this run, apply over all simulated graphs (different randomization
# degrees)
result <- run_ggm(run, priors, ranges, 
                  fcpg_context, ppi_db, subset, threads)

print("Saving results.")
save(file=fout, result, subset)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
sink()
sink(type="message")
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Prep libraries, scripts and params")

library(igraph)
library(graph)
library(BDgraph)

source("scripts/priors.R")
source("scripts/lib.R")
source("scripts/simulation/lib.R")

# ------------------------------------------------------------------------------
# Snakemake params and inputs

# inputs
fdata <- snakemake@input[["data"]]
franges <- snakemake@input[["ranges"]]
fpriors <- snakemake@input[["priors"]]

# outputs
fout <- snakemake@output[[1]]

# params
sentinel <- snakemake@params$sentinel
threads <- snakemake@threads
runs <- 1:as.numeric(snakemake@params$runs)

# ------------------------------------------------------------------------------
print("Loading data.")

data <- readRDS(fdata)
nodes <- colnames(data)
ranges <- readRDS(franges)
priors <- readRDS(fpriors)

# restrict to priors for which we also have data available
priors <- priors[rownames(priors) %in% nodes, colnames(priors) %in% nodes]

# ------------------------------------------------------------------------------
print(paste0("Running ", length(runs), " simulations."))

# we use bdgraph.sim internally, we need to set the number of threads which it
# uses to avoid threading issues:
RhpcBLASctl::omp_set_num_threads(1)
RhpcBLASctl::blas_set_num_threads(1)

simulations <- mclapply(runs, function(x) {

  set.seed(x)

  # create the hidden and observed graphs
  graphs <- create_prior_graphs(priors, sentinel, threads=1)

  print(paste0("Run ", x, " done."))

  # simulate data for ggm
  return(simulate_data(graphs, sentinel, data, nodes, threads=1))

}, mc.cores = threads)

names(simulations) <- paste0("run_", runs)

# ------------------------------------------------------------------------------
print("Saving results.")

save(file=fout, simulations, priors, ranges, nodes, data,
     runs, fdata, franges, fpriors)

# ------------------------------------------------------------------------------
print("SessionInfo:")
sessionInfo()
sink()
sink(type="message")
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(tidyverse)
library(graph)

# the sentinel to look for
sentinel <- snakemake@wildcards$sentinel

# the directory containing the simulation results
dresults <- snakemake@params$dresults

# the output file
fout <- snakemake@output$summary

threads <- snakemake@threads

# ------------------------------------------------------------------------------
print("Load and process data.")
# ------------------------------------------------------------------------------
library(parallel)

# we use all results from the "full" sample size simulation
finputs <- list.files(dresults, paste0(sentinel, ".*-subsetall.RData"), 
                      full.names = T)

if(length(finputs) < 100) stop("Missing results.")

res <- mclapply(finputs, function(f) {
  print(paste0("Processing ", f, "."))

  n <- load(f) # result object
  if(length(result) == 0) {warning("no results"); return(NULL)}

  # only look at results with full prior information
  result <- result[grepl("_rd0$", names(result))][[1]]

  # get observed graph and extract SNP genes
  gobs <- result$graph.observed
  snp <- result$snp
  if(!snp %in% nodes(gobs)) return(NULL)

  sgenes <- adj(gobs, result$snp)[[1]]

  if(length(sgenes) == 0) return(NULL)

  # iteratie over all inferred graphs and get number of correctly inferred 
  # snp genes
  inferred_graphs <- result$fits[!grepl("_fit$", names(result$fits))]

  lapply(names(inferred_graphs), function(n) {
    ginf <- inferred_graphs[[n]]

    if(!snp %in% nodes(ginf)) return(NULL)

    sgenes_inf <- adj(ginf, result$snp)[[1]]

    # get stats
    tp <- length(intersect(sgenes_inf, sgenes))
    fp <- length(setdiff(sgenes_inf, sgenes))
    acc <- tp / length(sgenes)

    tibble(result$snp, graph_type = n, number_snp_genes = length(sgenes),
           recovered_snp_genes = length(sgenes_inf),
           TP = tp, FP = fp, ACC = acc)

  }) %>% bind_rows()
}, mc.cores = threads) %>% bind_rows()

# plot the results and save table
write_tsv(res, fout)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
sink(file=snakemake@log[[1]])

# ------------------------------------------------------------------------------
# load needed libraries and source scripts

library(BDgraph)
library(graph)
library(igraph)
library(GenomicRanges)
library(tidyverse)
source("scripts/lib.R")
source("scripts/simulation/lib.R")

# ------------------------------------------------------------------------------
# get snakemake params

# inputs
ffits <- snakemake@input$fits

# outputs
foutput <- snakemake@output[[1]]

# ------------------------------------------------------------------------------
# Perform validation

# load data and get the validation results
tab <- lapply(ffits, function(f) {
  print(paste0("Processing ", basename(f), "."))

  load(f)

  iteration <- gsub(".RData|.txt","", gsub(".*iter", "", f))

  # quick hack to not have to redo output file definitions of other sim parts
  if(any(grepl("prior_completeness", f))) {
    get_validation_table_prior_completeness(result, iteration)
  } else {
    get_validation_table(result, iteration)  
  }

}) %>% bind_rows()

# ------------------------------------------------------------------------------
# save results

write_tsv(path=foutput, x=tab)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
sink()
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
print("Load libraries and source scripts")
# ------------------------------------------------------------------------------
library(tidyverse)
library(parallel)
source("scripts/lib.R")
source("scripts/snipe.R")

fvalidation <- snakemake@input$validation
fgwas <- snakemake@input$gwas
feqtlgen <- snakemake@input$eqtlgen

fout_validation <- snakemake@output$validation

threads <- snakemake@threads

# ------------------------------------------------------------------------------
print("Loading and processing data.")
# ------------------------------------------------------------------------------
gwas <- read_tsv(fgwas, col_types = cols(.default="c")) %>%
  as_tibble(.name_repair="universal") %>%
  separate_rows(SNPS)

# avoid the cluster_sizes column to be parsed as an integer (it's a comma
# separated string)
val <- read_tsv(fvalidation, col_types = cols(.default="c"))

snps <- unique(val$sentinel)

# get gwas traits for all SNPs
traits_per_snp <- mclapply(snps, function(s) {
  return(get_gwas_traits(s, gwas))
}, mc.cores=threads)
names(traits_per_snp) <- snps

val$gwas_disease_trait <- NA
val$gwas_disease_trait <- unlist(traits_per_snp[match(val$sentinel,
                                                names(traits_per_snp))])

# also process the gene list: get cis-eQTL snps and check their GWAS annot
print("Annotating selected genes with eQTL SNP GWAS traits.")
get_eqtl_snps <- function(gene_list, eqtl) {
  eqtl %>% 
    filter(FDR < 0.01 & GeneSymbol %in% gene_list) %>%
    pull(SNP) %>% unique
}
print("Loading eQTLgen data.")
eqtlgen <- read_tsv(feqtlgen)

print("Procesing without SNiPA results.")
traits_per_locus_genes <- mclapply(val$non_snp_genes_selected.list, function(genes) {
  if(!is.na(genes)) {
    gene_list <- strsplit(genes, ";")[[1]]

    snps <- sapply(gene_list, function(g) {
      s <- get_eqtl_snps(g, eqtlgen)  

      # only keep rs-ids, otherwise SNiPA will not work
      s[grepl("^rs", s)]
    })

    traits <- get_gwas_traits(unique(unlist(snps)), gwas, 
                              get.ld.snps = F, collapse = F)
    paste0(unique(traits), collapse = ";")

  } else {
    NA
  }
}, mc.cores=threads)
val$non_snp_genes_selected.gwas_traits <- unlist(traits_per_locus_genes)
print("Done.")

# any of the SNP traits matches one of the gene traits?
print("Checking SNP/Gene trait matches.")
val$gwas_trait_match <- sapply(1:nrow(val), function(i) {
  straits <- val$gwas_disease_trait[i]
  gtraits <- val$non_snp_genes_selected.gwas_traits[i]

  out <- FALSE
  if(!is.na(straits) & !is.na(gtraits)) {
    straits <- strsplit(straits, "\\|")[[1]]
    # split up gene traits, too, to make sure we match complete trait names
    gtraits <- strsplit(gtraits, ";")[[1]]
    if(any(sapply(straits, function(tr) { any(grepl(tr, gtraits)) }))) {
      out <- TRUE
    }
  }
  out
})

# ------------------------------------------------------------------------------
print("Writing output file.")
# ------------------------------------------------------------------------------
write_tsv(val, fout_validation)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# define easy concatenation operator
`%+%` = paste0

# ------------------------------------------------------------------------------
print("Load libraries and source scripts.")
# ------------------------------------------------------------------------------
library(BDgraph)
library(igraph)
library(graph)
library(data.table)
library(GenomicRanges)
library(qvalue)
library(illuminaHumanv3.db)
library(ggplot2)
library(parallel)
library(reshape2)
library(cowplot)

source("scripts/go-enrichment.R")
source("scripts/validation_methods.R")
source("scripts/mediation_methods.R")
source("scripts/lib.R")
source("scripts/reg_net.R")
source("scripts/reg_net_utils.R")

# ------------------------------------------------------------------------------
print("Get snakemake parameters.")
# ------------------------------------------------------------------------------

# inputs
fkora_data <- snakemake@input[["kora_data"]]
franges <- snakemake@input[["ranges"]]
flolipop_data <- snakemake@input[["lolipop_data"]]
fkora_fit <- snakemake@input[["kora_fit"]]
flolipop_fit <- snakemake@input[["lolipop_fit"]]

fgtex <- snakemake@input[["gtex"]]
fgeo <- snakemake@input[["geo"]]
fciseqtl_kora <- snakemake@input[["cis_kora"]]
ftranseqtl_kora <- snakemake@input[["trans_kora"]]
fbonder_eqtm <- snakemake@input[["bonder_eqtm"]]
fciseqtl_joehanes <- snakemake@input[["cis_joehanes"]]
ftranseqtl_joehanes <- snakemake@input[["trans_joehanes"]]
fmediation_kora <- snakemake@input[["mediation_kora"]]
fmediation_lolipop <- snakemake@input[["mediation_lolipop"]]

# params
threads <- snakemake@threads
sentinel <- snakemake@wildcards$sentinel
mediation_cutoff <- snakemake@params$mediation_cutoff
cohort <- snakemake@wildcards$cohort

# output
# main outfile for validation results
fout <- snakemake@output[[1]]

# ------------------------------------------------------------------------------
print("Loading data.")
# ------------------------------------------------------------------------------

print("Loading gene expression data.")

# load GEO/ARCHS4  data
geo <- fread(fgeo,
             header = T, sep = "\t")
colnames(geo)[1] <- "symbol"
setkey(geo, symbol)

print("Loading Joehanes eQTL.")
ceqtl <- fread(fciseqtl_joehanes,
               sep = ",")
# get only cis eqtls defined in the paper
ceqtl <- ceqtl[ceqtl$Is_Cis == 1]
print(paste0("Loaded ", nrow(ceqtl), " cis-eQTL"))

# trans eQTL
teqtl <- fread(ftranseqtl_joehanes,
               sep = ",")
print(paste0("Loaded ", nrow(teqtl), " trans-eQTL"))

# load the bonder cis-eQTMs for cpg-gene validation
eqtms <- fread(fbonder_eqtm, data.table = F)

# ------------------------------------------------------------------------------
print("Loading cohort data.")
# ------------------------------------------------------------------------------
kdata <- readRDS(fkora_data)
ldata <- readRDS(flolipop_data)

# remove (rare) all-NA cases. This can happen due to scaling of all-zero entities,
# which can arise due to a very large number of cis-meQTLs which effects get
# removed from the CpGs during data preprocessing.
# NOTE: we could possibly handle this differently? Seems that these particular
# cpgs are highly influenced by genetic effects?
use <- apply(kdata,2,function(x) (sum(is.na(x)) / length(x)) < 1)
kdata <- kdata[,use]
use <- apply(ldata,2,function(x) (sum(is.na(x)) / length(x)) < 1)
ldata <- ldata[,use]

# load ranges
ranges <- readRDS(franges)

# ------------------------------------------------------------------------------
print("Loading GGM fits.")
# ------------------------------------------------------------------------------
kfit <- readRDS(fkora_fit)
lfit <- readRDS(flolipop_fit)

fits <- list(kora = kfit, lolipop = lfit)

# we get the according dataset on which to validate
if ("lolipop" %in% cohort) {
  # ggm calculated on lolipop, validate on kora
  data_val <- kdata
  data_fit <- ldata
  cohort_val <- "kora"
} else {
  # assume kora cohort, validate on lolipop
  data_val <- ldata
  data_fit <- kdata
  cohort_val <- "lolipop"
}

graph_types <- c("bdgraph", "bdgraph_no_priors",
                 "genenet", "irafnet",
                 "glasso", "glasso_no_priors", "genie3")

# validate all graph models
# NOTE: although we used multi-threading before, it seems that this results in
# problems when integrating the GO enrichment (cryptic database disk image errors)
# Therefore we only use one thread for now, but run a lot of distinct jobs, this
# seems to do the trick
valid <- mclapply(graph_types, function(graph_type) {
  print(paste0("Validating ", cohort, " fit for '", graph_type , "' graph fit."))

  row <- c(sentinel = sentinel,
           cohort = cohort,
           graph_type = graph_type)

  # ----------------------------------------------------------------------------
  print("Preparing fit.")
  # ----------------------------------------------------------------------------
  graph <- fits[[cohort]][[graph_type]]

  # dnodes -> full set of possible nodes
  dnodes <- colnames(data_val)

  # ----------------------------------------------------------------------------
  print("Getting basic stats (number nodes, number edges, graph density)")
  # ----------------------------------------------------------------------------
  nn <- numNodes(graph)
  ne <- numEdges(graph)
  gd <- (ne * 2) / (nn * (nn - 1))
  row <- c(row, number_nodes=nn, number_edges=ne, graph_density=gd)

  # ----------------------------------------------------------------------------
  print("Getting cluster information")
  # ----------------------------------------------------------------------------
  # get all clusters in the graph
  ig = igraph::graph_from_graphnel(graph)
  cl = clusters(ig)
  ncluster <- cl$no
  scluster <- paste(cl$csize, collapse = ",")

  # remember snp membership
  snp_cluster <- NA
  snp_cluster_size <- NA
  if (sentinel %in% names(cl$membership)) {
    snp_cluster <- cl$membership[sentinel]
    snp_cluster_size <- cl$csize[snp_cluster]
  }

  row <- c(row,
           cluster=ncluster,
           cluster_sizes=scluster,
           snp_cluster=unname(snp_cluster),
           snp_cluster_size=snp_cluster_size)

  # ----------------------------------------------------------------------------
  print("Getting largest CC for validation.")
  # ----------------------------------------------------------------------------
  keep <- cl$membership == which.max(cl$csize)
  keep <- names(cl$membership[keep])
  if(!is.null(keep)) {
    graph_maxcluster <- graph::subGraph(keep, graph)
  }

  # the nodes retained in the fitted graph model in the largest CC
  gnodes <- graph::nodes(graph_maxcluster)

  # ----------------------------------------------------------------------------
  print("Calculating graph score.")
  # ----------------------------------------------------------------------------
  # we use the (full) igraph object for this, will be filtered for the sentinel
  # cluster
  score <- get_graph_score(ig, sentinel, ranges, gd)
  row <- c(row,
           graph_score = score)

  # ----------------------------------------------------------------------------
  print("Defining entity sets (selected / not selected)")
  # ----------------------------------------------------------------------------

  # get names of all entities, total and selected by ggm
  snp <- sentinel
  data_val[, snp] <- as.integer(as.character(data_val[, snp]))
  data_fit[, snp] <- as.integer(as.character(data_fit[, snp]))

  if(ranges$seed == "meqtl") {
    trans_entities <- intersect(dnodes, names(ranges$cpgs))
  } else {
    trans_entities <- intersect(dnodes, ranges$trans_genes$SYMBOL)
  }
  trans_entities_selected <- trans_entities[trans_entities %in% gnodes]

  all_genes <- dnodes[!grepl("^rs|^cg", dnodes)]
  sgenes <- intersect(dnodes, ranges$snp_genes$SYMBOL)
  if (snp %in% gnodes) {
    sgenes_selected <- sgenes[sgenes %in% unlist(adj(graph_maxcluster, snp))]

    # only proceed if we have at least one selected snp gene and trans entity
    if(length(sgenes_selected) > 0 & length(trans_entities_selected) > 0) {

      # collection of 'real' selected SNP genes (with path to one of the trans
      # entities)
      temp_genes <- c()

      # check each snp gene individually
      for(sg in sgenes_selected) {
        # snp genes have to be connected to trans entities without traversing
        # the snp itself and must not go via another snp gene
        # -> create subgraph without those entities
        sg_other <- setdiff(sgenes_selected, sg)
        graph_temp <- subGraph(setdiff(gnodes, c(snp, sg_other)),
                               graph_maxcluster)
        igraph_temp <- igraph::graph_from_graphnel(graph_temp)

        # get paths
        paths <-
          suppressWarnings(get.shortest.paths(igraph_temp, sg,
                                              intersect(V(igraph_temp)$name,
                                                        trans_entities_selected)))$vpath[[1]]

        # did we find a path with the current SNP gene?
        temp_genes <- unique(c(temp_genes, intersect(sg, paths$name)))
      }
      sgenes_selected <- temp_genes
    } else {
      sgenes_selected <- c()
    }
  } else {
    sgenes_selected <- c()
  }

  # cpg genes and TFs
  cgenes <- intersect(dnodes, ranges$cpg_genes$SYMBOL)
  # TODO also check TF to cpg-gene association..
  tfs <- intersect(dnodes, ranges$tfs$SYMBOL)

  if(ranges$seed == "meqtl") {
    # selected cpg genes and TFs
    if (length(trans_entities_selected) > 0) {
      cgenes_selected <- cgenes[cgenes %in% unlist(adj(graph_maxcluster,
                                                       trans_entities_selected))]
    } else {
      cgenes_selected <- c()
    }
  } else {
    cgenes_selected <- c()
  }

  if(length(trans_entities_selected) > 0) {
    tfs_selected <- tfs[tfs %in% unlist(adj(graph_maxcluster,
                                            trans_entities_selected))]
  } else {
    tfs_selected <- c()
  }

  # the shortest path genes
  spath <- ranges$spath$SYMBOL
  spath_selected <- spath[spath %in% gnodes]

  # collection of non snp genes which got selected
  non_snp_genes_selected <- c(tfs_selected, spath_selected)
  if(ranges$seed == "meqtl") {
    non_snp_genes_selected <- c(non_snp_genes_selected, cgenes_selected)
  } else {
    non_snp_genes_selected <- c(non_snp_genes_selected, trans_entities_selected)
  }
  non_snp_genes_selected <- unique(non_snp_genes_selected)

  # add to row
  row <- c(
    row,
    snp_genes=length(sgenes),
    snp_genes_selected=length(sgenes_selected),
    snp_genes.list=paste(sgenes, collapse = ";"),
    snp_genes_selected.list=paste(sgenes_selected, collapse = ";"),
    trans_entities = length(trans_entities),
    trans_entities_selected = length(trans_entities_selected),
    cpg_genes=length(cgenes),
    cpg_genes_selected=length(cgenes_selected),
    tfs=length(tfs),
    tfs_selected=length(tfs_selected),
    spath=length(spath),
    spath_selected=length(spath_selected),
    non_snp_genes_selected.list=paste0(non_snp_genes_selected, collapse=";"),
    tfs_per_trans=length(tfs)/length(trans_entities),
    tfs_per_trans_selected=length(tfs_selected)/length(trans_entities_selected)
  )

  # ------------------------------------------------------------------------------
  print("Using MCC to check how well graph replicated across cohorts.")
  # ------------------------------------------------------------------------------
  # get graph fit on other cohort
  graph_val <- fits[[cohort_val]][[graph_type]]

  replication <- get_graph_replication_f1_mcc(graph, graph_val)

  if (!is.null(replication)) {
    f1 <- replication$F1
    mcc <- replication$MCC

    print(paste0("MCC: ", format(mcc, digits = 3)))
    print(paste0("F1: ", format(f1, digits = 3)))

    # the fraction of nodes retained in the overlap w.r.t. to the
    # total number of possible nodes
    common_nodes <- replication$number_common_nodes
    mcc_frac <- common_nodes / ncol(data_val)
    row <- c(row, cross_cohort_f1=f1, cross_cohort_mcc=mcc,
             cross_cohort_mcc_frac=mcc_frac,
             common_nodes=common_nodes)
  } else {
    row <- c(row, cross_cohort_f1=NA, cross_cohort_mcc=NA,
             cross_cohort_mcc_frac=NA,
             common_nodes=NA)
  }

  # ------------------------------------------------------------------------------
  print("Checking mediation.")
  # ------------------------------------------------------------------------------
  if ("kora" %in% cohort) {
    med_val <- readRDS(fmediation_lolipop)
    med_fit <- readRDS(fmediation_kora)
  } else {
    med_val <- readRDS(fmediation_kora)
    med_fit <- readRDS(fmediation_lolipop)
  }
  row <-
    c(row,
      mediation.summary(med_val, sgenes, sgenes_selected, mediation_cutoff))

  # we also check the correspondence of the correlation values for all genes
  med_comparison <- compare_mediation_results(
    sentinel,
    med_val,
    med_fit,
    sgenes_selected,
    mediation_cutoff
  )

  row <- c(
    row, med_comparison
  )

  # ----------------------------------------------------------------------------
  print("Validating cis-eQTLs.")
  # ----------------------------------------------------------------------------

  # filter ceqtl to be only related to our sentinel SNP
  # TODO use proxy/high ld snps to increase ceqtl number?
  ceqtlsub <- ceqtl[ceqtl$Rs_ID %in% snp]
  if (nrow(ceqtlsub) < 1) {
    warning("Sentinel " %+% sentinel %+% " not found in cis-eQTL data")
    # report NA in stats file
    row <- c(row, cisEqtl=NA)
  } else {
    ceqtl_sgenes <- sgenes[sgenes %in% unlist(strsplit(ceqtlsub$Transcript_GeneSymbol, "\\|"))]
    ceqtl_sgenes_selected <-
      intersect(ceqtl_sgenes, sgenes_selected)

    # create matrix for fisher test
    cont <-
      matrix(
        c(
          length(ceqtl_sgenes),
          length(ceqtl_sgenes_selected),
          length(sgenes),
          length(sgenes_selected)
        ),
        nrow = 2,
        ncol = 2,
        byrow = T
      )
    rownames(cont) <- c("ceqtl", "no ceqtl")
    colnames(cont) <- c("not selected", "selected")
    row <- c(row,
             cisEqtl=fisher.test(cont)$p.value)
  }

  # ----------------------------------------------------------------------------
  print("CpG-gene and TF-CpG validation.")
  # ----------------------------------------------------------------------------
  # filter teqtl to be only related to our sentinel SNP
  # TODO use proxy/high ld snps to increase teqtl number?
  teqtlsub <-
    teqtl[teqtl$Rs_ID %in% snp]# & (teqtl$log10FDR) < (-2),,drop=F]
  if (nrow(teqtlsub) < 1) {
    warning("Sentinel " %+% sentinel %+% " not available in trans-eQTL data.")
    # report NA in stats file
    row <- c(row, transEqtl_tgenes=NA, transEqtl_tfs=NA)
  } else {
    if(ranges$seed == "meqtl") {
      row <- c(row,
               validate_trans_genes(teqtlsub, cgenes, tfs,
                                    cgenes_selected, tfs_selected))
    } else {
      row <- c(row,
               validate_trans_genes(teqtlsub, trans_entities, tfs,
                                    trans_entities_selected, tfs_selected))
    }
  }

  # we can also count how many of the identified cpg
  # genes are eQTMs in the bonder dataset
  # first convert bonder ensembl ids to symbols
  # (bonder results already filtered for sign. assoc.)
  bnd <- eqtms[, c("SNPName", "HGNCName")]
  bnd_symbol <- unique(bnd$HGNCName)
  row <-
    c(row, validate_cpggenes(bnd_symbol, cgenes, cgenes_selected))

  # ----------------------------------------------------------------------------
  print("Gene-Gene validation.")
  # ----------------------------------------------------------------------------
  # load geo whole blood expression data
  geo_sym <- all_genes[all_genes %in% geo$symbol]
  geosub <- geo[geo_sym]
  geosub <- t(geosub[, -1, with = F])
  colnames(geosub) <- geo_sym

  # list of all expr datasets
  expr.data <- list(geo = geosub, cohort = data_val[, all_genes, drop = F])
  val_g2g <- validate_gene2gene(expr.data, graph_maxcluster, all_genes)
  names(val_g2g) <- c("geo_gene_gene", "cohort_gene_gene")
  row <- c(row, val_g2g)

  return(row)
}, mc.cores=threads)

# write output file
valid <- do.call(rbind, valid)
if(is.null(dim(valid)) || ncol(valid) < 2) {
  stop("Error: wrong result dimensions.")
}

write.table(file=fout, valid, col.names=T,
            row.names=F, quote=F, sep="\t")

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
sink()
sink(type="message")
26
27
script:
	"scripts/test_tfa_inference.R"
93
94
script:
	"scripts/convert_cpg_context.R"
141
142
script:
	"scripts/create_priors.R"
SnakeMake From line 141 of master/Snakefile
165
166
script:
	"scripts/create_priors_eqtlgen.R"
SnakeMake From line 165 of master/Snakefile
187
188
script:
	"scripts/create_ppi_db.R"
SnakeMake From line 187 of master/Snakefile
202
203
script:
	"scripts/create-cosmo-splits.R"
SnakeMake From line 202 of master/Snakefile
229
230
script:
	"scripts/collect_ranges.R"
SnakeMake From line 229 of master/Snakefile
256
257
script:
        "scripts/collect_ranges_eqtl.R"
SnakeMake From line 256 of master/Snakefile
276
277
script:
	"scripts/create_locus_summary.R"
SnakeMake From line 276 of master/Snakefile
291
292
script:
        "scripts/annotate_tss_with_tf.R"
SnakeMake From line 291 of master/Snakefile
314
315
script:
	"scripts/calculate_tfa.R"
SnakeMake From line 314 of master/Snakefile
344
345
script:
	"scripts/collect_data_tfa.R"
SnakeMake From line 344 of master/Snakefile
366
367
script:
        "scripts/create_data_summary.R"
SnakeMake From line 366 of master/Snakefile
397
398
script:
	"scripts/collect_priors.R"
SnakeMake From line 397 of master/Snakefile
433
434
script:
        "scripts/prepare_magma_inputs.R"
SnakeMake From line 433 of master/Snakefile
452
453
script:
        "scripts/prepare_magma_inputs_gtex.R"
SnakeMake From line 452 of master/Snakefile
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
script:
        """
        touch {log}
        echo "Extracting locations" >> {log}
        # extract snp positions to be used for magma annotation (bim format)
        grep -v SNP {input.gwas} | awk '{{OFS="\t"}} {{print $2,$1,0,$3,$4,$5}}' \
          > {output.snp_locs}

        echo "Extracting p-values" >> {log}
        # extract SNP/pvalue file for gene level summaries
        echo -e "SNP\tP" > {output.snp_pvalues}
        grep -v SNP {input.gwas} | cut -f 1,6 >> \
          {output.snp_pvalues}

        echo "Done" >> {log}
        """
503
504
shell:
        "scripts/convert_lbm_gwas_to_bim.R"
SnakeMake From line 503 of master/Snakefile
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
        shell:
                """
		touch {log}

                # prepare geneset file for enrichment
                echo -n "{wildcards.sentinel} " > {output.gene_set}
                cut -f 1 {input.gene_locs} | sed -z "s/\\n/ /g;s/ $/\\n/" \
                  >> {output.gene_set}

                # full gene_annot locs
                magma --annotate \
                  --snp-loc {input.snp_locs} \
                  --gene-loc {input.gene_annot_locs} \
                  --out {params.all_gene_prefix} &>> {log}

                # get entity level summaries for all genes
                magma --bfile data/current/magma_reference_files/g1000_eur \
                  --pval {input.snp_pvalues} N={params.study_size} \
                  --gene-model snp-wise={params.snp_summary} \
                  --gene-annot {params.all_gene_prefix}.genes.annot \
                  --out {params.all_gene_prefix}.P{params.snp_summary} &>> {log}

                # perform enrichment (one-sided and two-sided)
		echo "Two-sided enrichment -----------------------" >> {log}
                magma --gene-results {params.all_gene_prefix}.P{params.snp_summary}.genes.raw \
                  --set-annot {output.gene_set} \
                  --model direction-sets=both \
                  --out {params.all_gene_prefix}.P{params.snp_summary}_twosided &>> {log}

		echo "One-sided enrichment -----------------------" >> {log}
                magma --gene-results {params.all_gene_prefix}.P{params.snp_summary}.genes.raw \
                  --set-annot {output.gene_set} \
                  --out {params.all_gene_prefix}.P{params.snp_summary} &>> {log}
                """
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
        shell:
                """
		touch {log}

                # prepare geneset file for enrichment
                echo -n "{wildcards.sentinel} " > {output.gene_set}
                cut -f 1 {input.gene_locs} | sed -z "s/\\n/ /g;s/ $/\\n/" \
                  >> {output.gene_set}

                # full gene_annot locs
                magma --annotate \
                  --snp-loc {input.snp_locs} \
                  --gene-loc {input.gene_annot_locs} \
                  --out {params.all_gene_prefix} &>> {log}

                # get entity level summaries for all genes
                magma --bfile data/current/magma_reference_files/g1000_eur \
                  --pval {input.snp_pvalues} N={params.study_size} \
                  --gene-model snp-wise={params.snp_summary} \
                  --gene-annot {params.all_gene_prefix}.genes.annot \
                  --out {params.all_gene_prefix}.P{params.snp_summary} &>> {log}

                # perform enrichment (one-sided and two-sided)
		echo "Two-sided enrichment -----------------------" >> {log}
                magma --gene-results {params.all_gene_prefix}.P{params.snp_summary}.genes.raw \
                  --set-annot {output.gene_set} \
                  --model direction-sets=both \
                  --out {params.all_gene_prefix}.P{params.snp_summary}_twosided &>> {log}

		echo "One-sided enrichment -----------------------" >> {log}
                magma --gene-results {params.all_gene_prefix}.P{params.snp_summary}.genes.raw \
                  --set-annot {output.gene_set} \
                  --out {params.all_gene_prefix}.P{params.snp_summary} &>> {log}
                """
ShowHide 40 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maitainers: public
URL: https://github.com/jhawe/bggm
Name: bggm
Version: 1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Downloaded: 0
Copyright: Public Domain
License: None
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...