Workflow Steps and Code Snippets

2 tagged steps and code snippets that match keyword FDb.InfiniumMethylation.hg19

Code used for the manuscript 'Network reconstruction for trans acting genetic loci using multi-omics data and prior information' by Hawe et al., 2022 in Genome Medicine

 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

# ------------------------------------------------------------------------------
# load packages, source scripts
# ------------------------------------------------------------------------------
library(GenomicRanges)
library(GenomicFeatures)
library(FDb.InfiniumMethylation.hg19)
library(data.table)
library(illuminaHumanv3.db)
library(rtracklayer)
library(graph)
library(RBGL) # for shortest paths
library(Matrix)
source("scripts/lib.R")
source("scripts/collect_ranges_methods.R")

# ------------------------------------------------------------------------------
# Get snakemake params
# ------------------------------------------------------------------------------
fcosmo <- snakemake@input$tcosmo
fmeqtl <- snakemake@input$meqtl
fppi_db <- snakemake@input$ppi_db
fprio_tab <- snakemake@input$priorization
fgene_annot <- snakemake@input$gene_annot

# TODO: create this file from scratch!
fcpgcontext <- snakemake@input$cpgcontext

ofile <- snakemake@output[[1]]
sentinel <- snakemake@wildcards$sentinel

# ------------------------------------------------------------------------------
# Load and preprocess data
# ------------------------------------------------------------------------------

print("Loading data.")

gene_annot <- load_gene_annotation(fgene_annot)
gene_annot$ids <- probes.from.symbols(gene_annot$SYMBOL,
                                           as_list=T)
ppi_db <- readRDS(fppi_db)

# load trans-meQTL table
trans_meQTL = read.csv(fmeqtl,
                       sep="\t", stringsAsFactors=F)

# load trans-cosmo information
cosmo <- readRDS(fcosmo)


# load priorization table
prio <- read.table(fprio_tab, sep="\t", header=T, stringsAsFactors = F)

# get trans-genes which should be used for shortest path extraction
prio <- prio[prio$sentinel == sentinel,,drop=F]
if(nrow(prio) > 0) {
  best_trans <- unique(prio$trans.gene)
} else {
  best_trans <- NULL
}

# ------------------------------------------------------------------------------
# Collect and save ranges
# ------------------------------------------------------------------------------

# ------------------------------------------------------------------------------
print("Collecting SNP and CpG ranges.")
# ------------------------------------------------------------------------------

pairs = which(trans_meQTL[,"sentinel.snp"] == sentinel)

# get sentinel idx
sentinel_idx <- which(cosmo$snp==sentinel)[1]

# the large interval range for the sentinel
chr <- paste0("chr", trans_meQTL[pairs,"chr.snp"][1])
start <- trans_meQTL[pairs,"interval.start.snp"][1]
end <- trans_meQTL[pairs,"interval.end.snp"][1]

sentinel_extrange <- GRanges(chr, IRanges(start,end))
sentinel_range <- with(cosmo[sentinel_idx,],
                       GRanges(paste0("chr", snp.chr),
                               IRanges(snp.pos, width=1)))
names(sentinel_range) <- names(sentinel_extrange) <- sentinel

# get cosmo subset
idxs <- get.trans.cpgs(sentinel, trans_meQTL, cosmo)

# get related genes, i.e. genes near meQTL loci (snp+cpg)

# extend cpgs
cosmosub <- cosmo[idxs,]

croi <- with(cosmosub, GRanges(paste0("chr", cpg.chr),
                               IRanges(cpg.pos,width=2)))
names(croi) <- as.character(cosmosub[,"cpg"])
croi <- unique(croi)

# extended sentinel region
sroi <- sentinel_extrange
names(sroi) <- sentinel

# ------------------------------------------------------------------------------
print("Retrieving SNP and CpG genes.")
# ------------------------------------------------------------------------------

# get the relevant snp genes by overlapping with our sentinel region
genes_sroi <- subsetByOverlaps(gene_annot, sroi, ignore.strand=T)

# get genes near our cpg regions
genes_by_cpg <- get.nearby.ranges(croi, promoters(gene_annot))
names(genes_by_cpg) <- names(croi)
# get original ranges (not promoters)
genes_by_cpg <- lapply(names(genes_by_cpg), function(cg) {
  gs <- genes_by_cpg[[cg]]
  gene_annot[gs$hit_idx]
})
names(genes_by_cpg) <- names(croi)

# get single list of all cpg genes
genes_croi <- unique(unlist(GRangesList(unlist(genes_by_cpg))))

# ------------------------------------------------------------------------------
print("Collecting TFs and shortest path genes.")
# ------------------------------------------------------------------------------

tfs <- NULL
sp <- NULL

# get cpg ids and SNP gene symbols
cpgs <- names(croi)
snp_genes <- unique(genes_sroi$SYMBOL)

# modify ppi_db to contain our CpGs

# load the cpg-tf context
tfbs_ann <- get_tfbs_context(names(croi), fcpgcontext)
cpgs_with_tfbs <- cpgs[cpgs %in% rownames(tfbs_ann[rowSums(tfbs_ann)>0,])]
snp_genes_in_string <- snp_genes[snp_genes %in% nodes(ppi_db)]

# get locus graph
locus_graph <- add.to.graphs(list(ppi_db), sentinel, snp_genes,
                             cpgs_with_tfbs, tfbs_ann)[[1]]

# get tfs connected to cpgs
tf_syms = unique(unlist(adj(locus_graph, cpgs_with_tfbs)))
print(paste0("Annotated TFs: ", paste(tf_syms, collapse=", ")))

if(length(tf_syms) < 1 | length(snp_genes_in_string) < 1) {
  warning(paste0("No TFs or none of the SNP genes are in PPI DB. ",
                 "Skipping shortest paths calculation."))
  # still, we want to keep the available TFs if there are no SNP genes 
  # within the PPI DB (would get adjusted using shortest paths below)
  if(length(snp_genes_in_string) >= 1) {
    tfs <- gene_annot[gene_annot$SYMBOL %in% tf_syms]
  }

} else {
  # the nodes we want to keep
  # in the original meQTL paper we removed KAP1 from the TF symbols
  nodeset <- c(nodes(ppi_db), tf_syms,
               snp_genes_in_string, cpgs_with_tfbs)
  locus_graph <- subGraph(intersect(nodes(locus_graph), nodeset), locus_graph)

  shortest_paths <- get_shortest_paths(cis = cpgs_with_tfbs,
                                       trans=unique(c(snp_genes_in_string,
                                                   tf_syms)),
                           snp_genes=snp_genes_in_string,
                           locus_graph,
                           tf_syms,
                           best_trans)

  non_tf_sp <- shortest_paths$non_tf_sp
  tf_sp <- shortest_paths$tf_sp

  if(length(non_tf_sp) < 1){
    warning("No shortest path genes.")
  } else {
    sp <- gene_annot[gene_annot$SYMBOL %in% non_tf_sp]
  }
  if(length(tf_sp) < 1) {
    # This should not happen -> sanity check
    stop("No TFs on shortest paths!")
  } else {
    tfs <- gene_annot[gene_annot$SYMBOL %in% tf_sp]
  }
}
print(paste0("Annotated TFs after shortest path calculations: ", 
             paste(tfs$SYMBOL, collapse=", ")))

# ------------------------------------------------------------------------------
print("Finalizing and saving results.")
# ------------------------------------------------------------------------------
result <-  list(cpgs=croi,sentinel=sentinel_range,
                sentinel_ext_range=sentinel_extrange,
                snp_genes=genes_sroi, cpg_genes=genes_croi,
                cpg_genes_by_cpg=genes_by_cpg)

if(!is.null(sp)){
  result$spath <- sp
}
if(!is.null(tfs)){
  result$tfs <- tfs
}
# set seed name
result$seed <- "meqtl"

saveRDS(file=ofile, result)

# ------------------------------------------------------------------------------
print("SessionInfo:")
# ------------------------------------------------------------------------------
sessionInfo()
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

# ------------------------------------------------------------------------------
# Load libraries and source scripts
# ------------------------------------------------------------------------------
suppressPackageStartupMessages(library(qvalue))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(graph))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(fdrtool))
suppressPackageStartupMessages(library(Homo.sapiens))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(FDb.InfiniumMethylation.hg19))
source("scripts/lib.R")
source("scripts/priors.R")

# ------------------------------------------------------------------------------
# Get snakemake params
# ------------------------------------------------------------------------------

# inputs
feqtl <- snakemake@input[["eqtl"]]
fsnpinfo <- snakemake@input[["snpinfo"]]
fexpr <- snakemake@input[["expr"]]
fsampleinfo <- snakemake@input[["sampleinfo"]]
fpheno <- snakemake@input[["pheno"]]
fppi <- snakemake@input[["ppi"]]
dplots <- snakemake@params$plot_dir

# outputs
fout_gene_priors <- snakemake@output$gene_priors
fout_eqtl_priors <- snakemake@output$eqtl_priors

# ------------------------------------------------------------------------------
# Start processing
# ------------------------------------------------------------------------------

print("Loading PPI db.")
ppi_db <- readRDS(fppi)

# simply delegate
create_priors(
  feqtl,
  fsnpinfo,
  fexpr,
  fsampleinfo,
  fpheno,
  dplots,
  ppi_db,
  fout_gene_priors,
  fout_eqtl_priors
)

if (FALSE) {
  # ------------------------------------------------------------------------------
  print("Prepare the Banovich based priors, i.e. TF-CpG priors.")
  # ------------------------------------------------------------------------------
  # methylation data
  meth <-
    fread("data/current/banovich-2017/methylation/full_matrix.txt",
          data.table = F)
  rownames(meth) <- meth$V1
  meth$V1 <- NULL
  cpgs <- features(FDb.InfiniumMethylation.hg19)
  cpgs <- cpgs[rownames(meth)]

  # expression data
  expr <-
    read.table(
      "data/current/banovich-2017/xun_lan/allTFexp.withHeader",
      header = T,
      sep = "\t",
      stringsAsFactors = F
    )

  # apparently the table contains duplicated entries, remove them
  expr <- expr[!duplicated(expr), ]
  rownames(expr) <- unique(expr[, 1])

  samples <- intersect(colnames(expr), colnames(meth))

  expr <- t(expr[, samples])
  meth <- t(meth[, samples])

  # ------------------------------------------------------------------------------
  print("Get (our) chip-seq context for the cpgs.")
  # ------------------------------------------------------------------------------
  tfbs_ann <- get_tfbs_context(names(cpgs), fcpgcontext)

  # ------------------------------------------------------------------------------
  print("For each TF, get the correlation to each of the CpGs it is bound nearby")
  # ------------------------------------------------------------------------------
  pairs <- lapply(colnames(expr), function(tf) {
    # get columns for tf
    sub <-
      tfbs_ann[, grepl(tf, colnames(tfbs_ann), ignore.case = T), drop = F]
    rs <- rowSums(sub)
    bound_cpgs <- names(rs[rs > 0])

    assoc <- unlist(mclapply(bound_cpgs, function(c) {
      cor.test(expr[, tf],
               meth[, c],
               method = "pearson")$p.value
    }, mc.cores = threads))

    cbind.data.frame(
      TF = rep(tf, length(assoc)),
      CpG = bound_cpgs,
      rho = assoc,
      stringsAsFactors = F
    )
  })

  # ------------------------------------------------------------------------------
  print("Collect and finalize results.")
  # ------------------------------------------------------------------------------
  tab <- do.call(rbind, pairs)
  colnames(tab) <- c("TF", "CpG", "pval")
  tab$qval <- qvalue(tab$pval)$lfdr
  tab$prior <- 1 - tab$qval
  head(tab)
  write.table(
    file = "results/current/tf-cpg-prior.txt",
    sep = "\t",
    col.names = NA,
    row.names = T,
    quote = F,
    tab
  )
}
# ------------------------------------------------------------------------------
print("Session info:")
# ------------------------------------------------------------------------------
sessionInfo()
data / bioconductor

FDb.InfiniumMethylation.hg19

Annotation package for Illumina Infinium DNA methylation probes: Compiled HumanMethylation27 and HumanMethylation450 annotations