Workflow Steps and Code Snippets

3 tagged steps and code snippets that match keyword BSgenome.Hsapiens.UCSC.hg19

Reproducible reanalysis of a combined ChIP-Seq & RNA-Seq data set

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
suppressMessages({
    library(rtracklayer)
    library(stringr)
    library(assertthat)
    library(dplyr)
    library(BSgenome.Hsapiens.UCSC.hg19)
})

{
    chainfile <- snakemake@input[["chain"]]
    chain <- import.chain(chainfile)

    outfile <- snakemake@output[[1]]
    assert_that(is.character(outfile))

    mySession <- browserSession()
    genome(mySession) <- "hg19"
    sites.table <- getTable(ucscTableQuery(mySession, track = "tfbsConsSites", table = "tfbsConsSites")) %>%
        fac2char
    names.table <- getTable(ucscTableQuery(mySession, track = "tfbsConsSites", table = "tfbsConsFactors")) %>%
        fac2char
    ## Keep only human entries, interpret "N" as NA
    names.table$id[names.table$id == "N"] <- NA
    names.table %<>% filter(species == "human") %>% select(-species) %>% droplevels %>%
        group_by(name) %>% summarize_all(. %>% str_c(collapse = ","))
    assert_that(!anyDuplicated(names.table$name))
    full.table <- sites.table %>% inner_join(names.table, "name")

    gr <- makeGRangesFromDataFrame(full.table, start.field = "chromStart", end.field = "chromEnd",
                                   starts.in.df.are.0based = TRUE, keep.extra.columns = TRUE,
                                   seqinfo = seqinfo(BSgenome.Hsapiens.UCSC.hg19))
    gr.lifted <- liftOverLax(gr, chain, allow.gap = 2) %>% .[lengths(.) == 1] %>% unlist
    save_RDS_or_RDA(gr.lifted, outfile)
}

DeepG4ToolsComparison: A snakemake pipeline to run and evaluate G4’s DNA prediction tools

 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
require(Biostrings)
if (!require(plyranges))
{
  BiocManager::install("plyranges")
}
require(plyranges)
if (!require(BSgenome.Hsapiens.UCSC.hg19))
{
  BiocManager::install("BSgenome.Hsapiens.UCSC.hg19")
}
library(BSgenome.Hsapiens.UCSC.hg19)
require(tidyverse)
seqlens <- seqlengths(BSgenome.Hsapiens.UCSC.hg19)
binbed <- snakemake@params[["random_regions"]] %>% read_bed()
#Functions (copied from DeepG4 package)
getScoreBW <- function (WIG, BED,forScan=F)
{
  res <- do.call("rbind",lapply(split(BED, droplevels(GenomeInfoDb::seqnames(BED))), function(zz) {
    cov <- WIG[[unique(as.character(GenomeInfoDb::seqnames(zz)))]]
    score <- IRanges::Views(cov, start = BiocGenerics::start(zz), end = BiocGenerics::end(zz))
    return(as.matrix(score))
  }))
  if(forScan){
    return(res)
  }else{
    return(rowMeans(res))
  }
}
NormBW <- function(x,binbed){
  ranges_bin <-  base::range(IRanges::subsetByOverlaps(x,binbed)$score,na.rm = TRUE, finite = TRUE)
  x$score <- scales::rescale(x$score,to = c(0,1),from = ranges_bin)
  x <- IRanges::coverage(x,weight = "score")
  return(x)
}


#read bed files
readBed <- . %>% read_tsv(col_names = F) %>% dplyr::select(1:3) %>% setNames(c("seqnames","start","end")) %>% as_granges()
G4pos.bed <- snakemake@input[["bed_pos"]] %>% read_bed()
if(unique(width(G4pos.bed)) != 201){
  G4pos.bed <- snakemake@input[["bed_pos"]] %>% readBed
}
G4neg.bed <- snakemake@input[["bed_ctrl"]] %>% read_bed()
if(unique(width(G4neg.bed)) != 201){
  G4neg.bed <- snakemake@input[["bed_ctrl"]] %>% readBed
}

G4pos.bed <- G4pos.bed %>% filter(seqnames != "chrY")
G4neg.bed <- G4neg.bed %>% filter(seqnames != "chrY")

#generate open chromatin dataset (ATAC or DNAse-seq)
my_seuil_bg <- snakemake@params[["seuil"]]
my_window_bg <- as.numeric(snakemake@params[["window"]])
ATAC_dataset <- snakemake@input[["atac_data"]]


G4all.bed <- c(G4pos.bed,G4neg.bed)
G4all.bed$order <- 1:length(G4all.bed)
G4all.bed <- BiocGenerics::sort(GenomeInfoDb::sortSeqlevels(G4all.bed))
G4all.atac <- ATAC_dataset %>% map(function(x){
  xbw <- x %>% import.bw %>% NormBW(binbed)
  res <- xbw %>% getScoreBW(G4all.bed)
  res[is.na(res)] <- 0
  return(res)
})%>% purrr::reduce(`+`) / length(ATAC_dataset)



G4all.bed.bg <- G4all.bed %>% anchor_center() %>% mutate(width=my_window_bg)%>%
  as_tibble() %>%
  left_join(enframe(seqlens),by = c("seqnames"="name")) %>%
  mutate(end = ifelse(end>value,value,end)) %>% dplyr::select(-width) %>% as_granges()

G4all.atac.bg <- ATAC_dataset %>% map(function(x){
  xbw <- x %>% import.bw() %>% NormBW(binbed)
  res <- xbw %>% getScoreBW(G4all.bed.bg)
  res[is.na(res)] <- 0
  return(res)
})%>% purrr::reduce(`+`) / length(ATAC_dataset)

my_test <- (G4all.atac/G4all.atac.bg)<my_seuil_bg
G4all.atac[my_test] <- 0


tibble(atac_seq=G4all.atac) %>% write_tsv(snakemake@output[["atac_merged"]],col_names = F)

MPRA GWAS Builder: snakemake workflow

  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
save.image("logs/intersect_epigenome.RData")


log <- file(snakemake@log[[1]], open="wt")
sink(log, type = "message")
sink(log, type = "output")

## Load packages
# library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
# library(SNPlocs.Hsapiens.dbSNP151.GRCh38)
# library(BSgenome.Hsapiens.UCSC.hg19)
# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
# library(VariantAnnotation)
# library(rtracklayer)
# library(plyranges)


## Set up project
# library(ProjectTemplate)
# load.project()

# str(snakemake@config$epigenome)

library(rtracklayer)
library(plyranges)
library(tidyverse)

set.seed(snakemake@config$seed)

## Load data
# ldlink_full_results <- read_tsv("./data/raw/lib3_design/ldlink_full_results.txt")
# haploreg_full_results <- read_tsv("./data/raw/lib3_design/haploreg_full_results.txt")

ld_snps <- read_tsv(snakemake@input$ld_snps)

hg19_to_hg38_chain <- import.chain("assets/hg19ToHg38.over.chain")

if ("epigenome_csv" %in% names(snakemake@config) && file.exists(snakemake@config$epigenome_csv)) {

    epigenome_csv <- read_csv(snakemake@config$epigenome_csv)
    epigenome_keys <- epigenome_csv$name
    epigenome_bed <- map2(epigenome_csv$bedfile, epigenome_csv$genome, function(bedfile, genome) {
        bed <- read_narrowpeaks(bedfile)
        if (genome == "hg19") {
            bed <- liftOver(bed, hg19_to_hg38_chain) %>% unlist
        }
        return(bed)
        })

} else {

    epigenome_keys <- names(snakemake@config$epigenome)
    epigenome_bed <- map(snakemake@config$epigenome, function(epigenome) {
        bed <- read_narrowpeaks(epigenome$bedfile)
        if (epigenome$genome == "hg19") {
            bed <- liftOver(bed, hg19_to_hg38_chain) %>% unlist
        }
        return(bed)
        })
}

epigenome_df <- tibble(key = epigenome_keys,
                       bed = epigenome_bed) %>%
    mutate(key = str_replace_all(key, "[^A-Za-z0-9_]", "_")) %>%
    group_by(key) %>%
    summarise(bed = list(reduce(bed, union_ranges)))

epigenome_keys <- epigenome_df$key
epigenome_bed <- epigenome_df$bed %>% set_names(epigenome_df$key)

ld_snps_gr <- ld_snps %>%
    filter(!is.na(coord_b38)) %>%
    extract(coord_b38, c("chr", "pos"), "(chr[0-9XY]+):(\\d+)", remove = F) %>%
    mutate(start = pos, end = pos) %>%
    select(-pos) %>%
    makeGRangesFromDataFrame(keep.extra.columns = T)

epigenome_ranges <- map(epigenome_bed,
    ~ as_tibble(.) %>%
    mutate(range = paste0(seqnames, ":", start, "-", end)) %>%
    pull(range))

mcols(ld_snps_gr) <- cbind(mcols(ld_snps_gr),
    map2_dfc(epigenome_ranges, epigenome_bed, ~ .x[findOverlaps(ld_snps_gr, .y, maxgap = 0, select = "first")]))


if (!is.null(snakemake@config$eqtls)) {
    eqtls <-
        map_dfr(snakemake@config$eqtls,
            ~ read_tsv(.$file), .id = "tissue")

    eqtls <- eqtls %>%
        extract(variant_id, c("chr", "pos"), "^(chr[0-9XY]+)_(\\d+)", remove = F) %>%
        mutate(pos = as.integer(pos))
} else {
    eqtls <- tibble(chr = character(), pos = integer(), variant_id = character())
}




ld_snps_epigenome <- ld_snps_gr %>%
    as_tibble(.name_repair = "minimal") %>%
    select(-end, -width, -strand) %>%
    dplyr::rename(chr = seqnames,
                  pos = start) %>%
    mutate(across(all_of(epigenome_keys), ~ ifelse(!is.na(.), cur_column(), NA), .names = "{.col}_dummy")) %>%
    unite(Epigenome, ends_with("_dummy"), sep = ";", na.rm = T) %>%
    left_join(eqtls %>% distinct(chr, pos, eQTL = variant_id))


write_tsv(ld_snps_epigenome, snakemake@output$epigenome)


peak_stats <- tibble(peakset = epigenome_keys, bed = epigenome_bed) %>%
    mutate(peak_num = map_int(epigenome_bed, length),
           peak_width = map_int(epigenome_bed, ~ sum(width(.)))) %>%
    select(-bed)

write_csv(peak_stats, snakemake@output$peak_stats)
data / bioconductor

BSgenome.Hsapiens.UCSC.hg19

Full genome sequences for Homo sapiens (UCSC version hg19, based on GRCh37.p13): Full genome sequences for Homo sapiens (Human) as provided by UCSC (hg19, based on GRCh37.p13) and stored in Biostrings objects.