Code Snippets

MPRA GWAS Builder: snakemake workflow

# Snapshot the current workspace to disk before doing any work, so the state
# the script started from can be reloaded for interactive debugging.
save.image("logs/clean_index_snps.RData")

# Redirect both the message stream (messages, warnings, errors) and regular
# printed output into the log file supplied by Snakemake for this rule.
log <- file(snakemake@log[[1]], open="wt")
sink(log, type = "message")
sink(log, type = "output")


library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
library(SNPlocs.Hsapiens.dbSNP151.GRCh38)
library(XtraSNPlocs.Hsapiens.dbSNP141.GRCh38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(magrittr)
library(tidyverse)

source("lib/helpers.R")


# Load the hg19 -> hg38 liftOver chain file.
# NOTE(review): `import.chain` is not exported by any package attached in the
# library() calls visible in this script — presumably lib/helpers.R makes it
# available (e.g. by attaching rtracklayer); confirm.
hg19_to_hg38_chain <- import.chain("assets/hg19ToHg38.over.chain")

# threads <- 4

# if (threads > 1) {
#     library(doMC)
#     registerDoMC(cores = threads)
#     do_parallel <- T
# } else {
#     do_parallel <- F
# }

# Read the GWAS table with every column typed as character and quote handling
# disabled, so identifiers and free-text fields are taken verbatim.
index_snp_table <- read_tsv(snakemake@input$gwas,
                            col_types = cols(.default = col_character()), quote = "")
# index_snps <- read_tsv("./data/raw/lib3_design/skin_disease_index_snps.txt")

# Sanity check: is every SNPS entry either an rsID ("rs<digits>") or a
# "chr<N>:<pos>" coordinate? The result is neither assigned nor acted on; at
# top level it is only auto-printed (into the log, since output is sinked), so
# a FALSE here does not stop the run.
all(str_detect(index_snp_table$SNPS, "^rs\\d+$") |
        str_detect(index_snp_table$SNPS, "^chr[0-9XY]+:\\d+$"))


# Keep the GWAS columns used downstream under short names and derive a
# "chr<chr>:<pos>" coordinate string (coord_b38) for every row.
index_snps <- index_snp_table %>%
    select(
        disease = Disease,
        gwas_snp = SNPS,
        chr = CHR_ID,
        pos = CHR_POS,
        pubmed = PUBMEDID,
        sample = `INITIAL SAMPLE SIZE`
    ) %>%
    mutate(
        # Coordinate built from CHR_ID / CHR_POS whenever a chromosome is given.
        coord_b38 = ifelse(is.na(chr), NA, paste0("chr", chr, ":", pos)),
        # Otherwise fall back to a coordinate written directly in the SNPS field.
        coord_b38 = ifelse(is.na(coord_b38) & str_detect(gwas_snp, "chr.+:\\d+"),
                           gwas_snp, coord_b38)
    )


# Build single-base ranges for every index SNP that has a coordinate string.
index_snps_gr <- index_snps %>%
    filter(!is.na(coord_b38)) %>%
    extract(coord_b38, c("chr", "pos"), "chr([0-9XY]+):([0-9]+)") %>%
    # The coordinate string is built more permissively than this regex parses
    # it (any "chr.+:<digits>" SNPS value is accepted, and a missing CHR_POS
    # yields "chr<N>:NA"). Rows that fail to parse come back as NA and would
    # reach makeGRangesFromDataFrame() as NA start/end, so drop them here.
    filter(!is.na(chr), !is.na(pos)) %>%
    mutate(start = pos, end = pos) %>%
    makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# Look up rsIDs overlapping those positions in the GRCh37 and GRCh38 SNPlocs
# packages. The same coordinates are queried against both builds; the build is
# resolved later, depending on which lookup yields an rsID.
snps_find_rsid_b37 <- snpsByOverlaps(SNPlocs.Hsapiens.dbSNP144.GRCh37, index_snps_gr)
snps_find_rsid_b38 <- snpsByOverlaps(SNPlocs.Hsapiens.dbSNP151.GRCh38, index_snps_gr)
# The XtraSNPlocs package is queried with "dbSNP"-style sequence names; the
# hits are switched to "NCBI" style afterwards.
snps_find_rsid_b38_xtra <- snpsByOverlaps(XtraSNPlocs.Hsapiens.dbSNP141.GRCh38,
                                          `seqlevelsStyle<-`(index_snps_gr, "dbSNP")) %>%
    `seqlevelsStyle<-`("NCBI")



# GRCh37 hits as a two-column lookup: rsID + "chr<seq>:<pos>" coordinate.
snps_find_rsid_b37_tbl <- snps_find_rsid_b37 %>%
    as.data.frame() %>%
    transmute(rs_id_rescue_b37 = RefSNP_id,
              coord_b37 = paste0("chr", seqnames, ":", pos))

# GRCh38 hits from the main and the "Xtra" SNPlocs packages, stacked into one
# lookup. The main package exposes the position as `pos`, the Xtra one as
# `start`.
b38_main_hits <- snps_find_rsid_b38 %>%
    as.data.frame() %>%
    transmute(rs_id_rescue = RefSNP_id,
              coord_b38 = paste0("chr", seqnames, ":", pos))
b38_xtra_hits <- snps_find_rsid_b38_xtra %>%
    as.data.frame() %>%
    transmute(rs_id_rescue = RefSNP_id,
              coord_b38 = paste0("chr", seqnames, ":", start))
snps_find_rsid_b38_tbl <- bind_rows(b38_main_hits, b38_xtra_hits)
# snps_find_rsid_b38_tbl <- snps_find_rsid_b38 %>% as.data.frame() %>%
#     mutate(coord_b38 = paste0("chr", seqnames, ":", pos)) %>%
#     select(rs_id_rescue = RefSNP_id, coord_b38)

# Resolve one `index_snp` identifier per GWAS row, in this order of preference:
#   1. gwas_snp itself when it already starts with an rsID;
#   2. the rsID found at the row's coordinate by the GRCh38 lookups;
#   3. the rsID found at the same coordinate string by the GRCh37 lookup — in
#      that case the coordinate is moved from coord_b38 to coord_b37;
#   4. the raw gwas_snp value as a last resort.
# The first join has no `by`, so it matches on the column(s) the two tables
# share (coord_b38).
index_snps_cleaned <- left_join(index_snps, snps_find_rsid_b38_tbl) %>%
    mutate(index_snp = ifelse(str_detect(gwas_snp, "^rs\\d+"), gwas_snp,
                              ifelse(!is.na(rs_id_rescue), rs_id_rescue, NA))) %>%
    left_join(snps_find_rsid_b37_tbl, by = c("coord_b38" = "coord_b37")) %>%
    # Order matters inside this mutate: coord_b37 must copy coord_b38 before
    # coord_b38 is blanked, and index_snp is overwritten last because the two
    # coordinate expressions test its still-unresolved (NA) state.
    mutate(coord_b37 = ifelse(is.na(index_snp) & !is.na(rs_id_rescue_b37), coord_b38, NA),
           coord_b38 = ifelse(is.na(index_snp) & !is.na(rs_id_rescue_b37), NA, coord_b38),
           index_snp = ifelse(is.na(index_snp) & !is.na(rs_id_rescue_b37), rs_id_rescue_b37, index_snp)) %>%
    mutate(index_snp = ifelse(is.na(index_snp), gwas_snp, index_snp)) %>%
    select(disease, gwas_snp, index_snp, coord_b38, coord_b37, pubmed, sample)


# Write the cleaned index SNP table to the rule's declared output.
write_csv(index_snps_cleaned, snakemake@output$index_snps)

R tidyverse SNPlocs.Hsapiens.dbSNP144.GRCh37 TxDb.Hsapiens.UCSC.hg38.knownGene magrittr doMC From line 4 of scripts/CleanIndexSNPs.R

# Load environment variables from the .Renviron file in the working directory.
readRenviron(".Renviron")

# Snapshot the current workspace to disk before doing any work, so the state
# the script started from can be reloaded for interactive debugging.
save.image("logs/get_snps_in_ld.RData")

# Redirect both the message stream (messages, warnings, errors) and regular
# printed output into the log file supplied by Snakemake for this rule.
log <- file(snakemake@log[[1]], open="wt")
sink(log, type = "message")
sink(log, type = "output")

# haploR is installed on first use when it is missing from the library.
# requireNamespace() is the cheap way to test for a single package
# (installed.packages() scans every installed package), and the CRAN mirror is
# addressed over https so the download is not fetched over plain http.
if (!requireNamespace("haploR", quietly = TRUE)) {
    options(repos = c(CRAN = "https://cloud.r-project.org"))
    install.packages("haploR")
}

library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
library(SNPlocs.Hsapiens.dbSNP151.GRCh38)
library(XtraSNPlocs.Hsapiens.dbSNP141.GRCh38)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)
library(LDlinkR)
library(haploR)
library(VariantAnnotation)
library(magrittr)
library(tidyverse)

source("lib/helpers.R")

# Seed the RNG from the workflow configuration.
set.seed(snakemake@config$seed)

# hg19 -> hg38 liftOver chain; used further down to lift LDlink coordinates.
hg19_to_hg38_chain <- import.chain("assets/hg19ToHg38.over.chain")

# threads <- 4

# if (threads > 1) {
#     library(doMC)
#     registerDoMC(cores = threads)
#     do_parallel <- T
# } else {
#     do_parallel <- F
# }

# Cleaned index SNP table. Column types are guessed by read_csv here.
index_snps_cleaned <- read_csv(snakemake@input$index_snps)
# index_snps <- read_tsv("./data/raw/lib3_design/skin_disease_index_snps.txt")

# r2 cut-offs: the general one, and an optional one applied to every
# population other than "ALL" (see where snps_to_query is built).
r2_threshold <- snakemake@config$r2_threshold
r2_threshold_pop_specific <- snakemake@config$r2_threshold_pop_spec


# Populations every index SNP is queried in.
pops <- snakemake@config$pops
# pops <- c("EUR", "AFR", "AMR", "EAS", "SAS", "ALL")

# When a population key table is configured, parse each study's free-text
# sample description into "<number> <descriptor> <sample type>" phrases, look
# the descriptor up in the key table, and attach the resulting population
# codes to every index SNP of that study. Without a key table an empty
# placeholder with the same core columns is created instead.
# NOTE(review): gwas_pop_key is assumed to provide `term` and `code` columns
# (both are used below) — confirm against the key file.
if (!is.null(snakemake@config$gwas_pop_key)) {
    gwas_pop_key <- read_tsv(snakemake@config$gwas_pop_key)

    # Regex fragments for the noun that ends a sample phrase. Several
    # misspellings of "individuals" are listed explicitly.
    sample_types <- c("individuals?",
                      "cases?",
                      "controls?",
                      "men",
                      "women",
                      "boys?",
                      "girls?",
                      "adults?",
                      "adolescents?",
                      "children and adolescents",
                      "children",
                      "infants?",
                      "neonates?",
                      "mothers?",
                      "fathers?",
                      "parents?",
                      "males?",
                      "females?",
                      "users?",
                      "non-users?",
                      "families",
                      "trios?",
                      "responders?",
                      "non-responders?",
                      "attempters?",
                      "nonattempters?",
                      "alcohol drinkers?",
                      "drinkers?",
                      "non-drinkers?",
                      "smokers?",
                      "non-smokers?",
                      "donors?",
                      "twin pairs?",
                      "twins?",
                      "child sibling pairs?",
                      "fetuses",
                      "offspring",
                      "early adolescents?",
                      "remitters?",
                      "non-remitters?",
                      "athletes?",
                      "Individuals?",
                      "indivduals?",
                      "triads?",
                      "patients?",
                      "pairs?",
                      "case-parent trios?",
                      "recipients?",
                      "affected child",
                      "long sleepers?",
                      "short sleepers?",
                      "unaffected relatives?",
                      "carriers?",
                      "non-carriers?",
                      "cell lines?",
                      "indiviudals?",
                      "referents?",
                      "individuuals?",
                      "duos?",
                      "indivdiuals?",
                      "inidividuals?")

    # A count: digits with optional ",ddd" groups, preceded by whitespace or a
    # word boundary and followed by whitespace.
    number_regex <- "(?:(?<=(?:\\s|\\b))\\d+(?:\\,\\d+)*(?=\\s))"
    # Alternation over all sample-type fragments.
    type_regex <- paste0("(?:", paste0(sample_types, collapse = "|"), ")")


    # Three capture groups: the count, the descriptor text between count and
    # sample type, and the (optional) last sample type in the phrase.
    full_regex <- paste0(
        "(", number_regex, ")", # greedy match first number
        "\\s*((?:(?!.*", type_regex, ").*)|(?:.*?))\\s*", # Greedy match rest if no sample type in lookahead, or passive match
        "(", type_regex,  "?(?!.*", type_regex, "))") # Match last sample type by ensuring no sample type in lookahead

    # split_regex <- "(?<!\\d)(,[\\s\\,]*| and )(?=[\\sA-Aa-z]*[0-9]+[,0-9]*[0-9]+\\s)"
    # Split a description on commas or "and " when a count follows.
    # NOTE(review): the character class `[\\sA-Aa-z]` contains the range
    # "A-A", i.e. only the capital letter A; this looks like a typo for
    # "A-Za-z" — confirm before changing, since it alters where text is split.
    split_regex <- paste0("((?:,+[,\\s]*\\s+)|(?:and ))(?=[\\sA-Aa-z]*", number_regex, ")")


    # One row per (study, phrase): split each distinct sample description.
    sample_terms <- index_snps_cleaned %>%
        distinct(pubmed, sample) %>%
        mutate(split_sample = str_split(sample, split_regex)) %>%
        unnest(split_sample)


    # Apply full_regex to every phrase; str_match() returns the whole match
    # plus the three capture groups, named here match/number/capture/type.
    full_matches <- bind_cols(sample_terms,
                              str_match(sample_terms$split_sample, full_regex) %>%
                                  set_colnames(c("match", "number", "capture", "type")) %>%
                                  as_tibble())

    # Look each captured descriptor up in the key table (the join has no `by`,
    # so it matches on the shared column names) and keep phrases with a code.
    study_key_table <- full_matches %>%
        distinct(pubmed, sample, split_sample, capture) %>%
        rename(term = capture) %>%
        left_join(gwas_pop_key) %>%
        filter(!is.na(code))

    # Per index SNP row: the sorted, de-duplicated, comma-joined population
    # codes of its study. `code` entries may themselves be comma-separated.
    index_snps_pop_match <- index_snps_cleaned %>%
        left_join(study_key_table) %>%
        distinct() %>%
        group_by(disease, gwas_snp, index_snp, coord_b38, coord_b37, pubmed, sample) %>%
        summarise(pops = paste0(sort(unique(unlist(str_split(code, ",")))), collapse = ",")) %>%
        ungroup()


    # NOTE(review): this script has not created "outs/" at this point —
    # presumably the directory already exists when the rule runs; confirm.
    write_tsv(index_snps_pop_match, "outs/gwas_study_index_snps_matched_populations.tsv")

    # Per-study summary: matched populations and number of distinct index SNPs.
    index_snps_pop_match %>%
        group_by(disease, pubmed, sample, pops) %>%
        summarise(n_snps = n_distinct(index_snp, na.rm = T)) %>%
        write_tsv("outs/gwas_study_matched_populations.tsv")


} else {
    # No key table configured: empty placeholder so the code below still runs.
    index_snps_pop_match <- tibble(disease = character(),
                                   pubmed = character(),
                                   sample = character(),
                                   index_snp = character(),
                                   pops = character())
}

# Upper bound on how many populations a study may match and still contribute
# study-specific populations. When the option is absent from the config,
# comparing against NULL would yield a zero-length logical and abort the
# filter, so an unset value is treated as "no limit".
max_pops <- snakemake@config$max_pops
max_pops_limit <- if (is.null(max_pops)) Inf else max_pops


# Studies with at least one matched population and no more than the limit.
index_snps_pop_match_filtered <- index_snps_pop_match %>%
    filter(!is.na(pops) & pops != "") %>%
    filter(lengths(str_split(pops, ",")) <= max_pops_limit)


# Every index SNP in every configured population, plus the study-specific
# (SNP, population) pairs obtained from the population matching.
index_snps_pop_all <- crossing(index_snp = unique(index_snps_cleaned$index_snp),
                               pop = pops) %>%
    bind_rows(index_snps_pop_match_filtered %>%
                  mutate(pop = str_split(pops, ",")) %>%
                  unnest(pop) %>%
                  distinct(index_snp, pop))


# r2 cut-off for populations other than "ALL": the population-specific value
# when configured, otherwise the general one. Resolved once here rather than
# inside the vectorised ifelse().
r2_threshold_other_pops <- if (is.null(r2_threshold_pop_specific)) {
    r2_threshold
} else {
    r2_threshold_pop_specific
}

# Only rsIDs can be queried; each row carries the r2 cut-off for its population.
snps_to_query <- index_snps_pop_all %>%
    filter(str_detect(index_snp, "rs\\d+"),
           !is.na(pop) & pop != "") %>%
    mutate(r2_threshold = ifelse(pop == "ALL",
                                 r2_threshold, r2_threshold_other_pops))

# Directory handed to query_ldlink() as its out_dir.
out_dir <- "outs/SNPS_LDlink"

dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# One query_ldlink() call per (index SNP, population, r2 cut-off) row; the
# returned object is stored in a list-column.
ldlink_results <- snps_to_query %>%
    mutate(ldlink_results = pmap(
        list(index_snp, pop, r2_threshold),
        function(snp_id, pop_code, r2_min) {
            query_ldlink(snp = snp_id, pop = pop_code, r2 = r2_min,
                         out_dir = out_dir,
                         retry_errors = snakemake@config$retry_errors)
        }
    ))

# Flatten the list-column and keep rows at or above each row's r2 cut-off.
ldlink_results_table <- filter(unnest(ldlink_results, ldlink_results),
                               R2 >= r2_threshold)

write_tsv(ldlink_results_table, "outs/ldlink_full_results.txt")

# Map the population codes used in this workflow onto the codes passed to
# query_haploreg(). EAS and SAS both map to ASN.
haploreg_pops <- c("AFR" = "AFR",
                   "AMR" = "AMR",
                   "EAS" = "ASN",
                   "EUR" = "EUR",
                   "SAS" = "ASN")

out_dir_haploreg <- "outs/SNPS_HaploReg"
dir.create(out_dir_haploreg, showWarnings = FALSE, recursive = TRUE)

# One query_haploreg() call per (population, r2 cut-off) group, passing all of
# that group's index SNPs at once.
haploreg_results <- snps_to_query %>%
    filter(pop %in% names(haploreg_pops)) %>%
    # unname(): indexing a named vector carries the names along; the column
    # only needs the mapped codes.
    mutate(pop = unname(haploreg_pops[pop])) %>%
    group_by(pop, r2_threshold) %>%
    # unique(): because two source populations collapse onto ASN, the same
    # rsID can occur more than once within a group; query each SNP once.
    summarise(index_snps = list(sort(unique(index_snp)))) %>%
    mutate(haploreg_results = pmap(list(index_snps, pop, r2_threshold),
        ~ query_haploreg(snps = ..1, pop = ..2, r2 = ..3,
                        force = TRUE, out_dir = out_dir_haploreg))) %>%
    ungroup()

if (nrow(haploreg_results) > 0) {
    # Flatten the per-group results and keep rows at or above the cut-off.
    haploreg_results_table <- haploreg_results %>%
        select(pop, r2_threshold, haploreg_results) %>%
        unnest(haploreg_results) %>%
        select(index_snp = query_snp_rsid, everything()) %>%
        filter(r2 >= r2_threshold)
} else {
    # Nothing was queried: empty table with the columns used downstream.
    haploreg_results_table <- tibble(
        index_snp = character(),
        pop = character(),
        chr = character(),
        pos_hg38 = character(),
        r2 = double(),
        D = double(),
        is_query_snp = double(),
        rsID = character(),
        ref = character(),
        alt = character()
    )
}

write_tsv(haploreg_results_table, "outs/haploreg_full_results.txt")



# Harmonize rsIDs and genomic coordinates for all LD SNPs from both sources

# ldlink_results_table <- read_tsv("./data/raw/lib3_design/ldlink_full_results.txt")
# haploreg_results_table <- read_tsv("./data/raw/lib3_design/haploreg_full_results.txt")

## LDlink data is in hg19 coordinates
# Split "ref/alt" out of the Alleles column; rows that do not parse are dropped.
ldlink_snps <- ldlink_results_table %>%
    extract(Alleles, into = c("ref", "alt"), regex = "([ACGT-]+)\\/([ACGT-]+)",
            remove = FALSE) %>%
    filter(!(is.na(ref) | is.na(alt)))

# Single-base hg19 ranges carrying the SNP metadata.
ldlink_snps_gr_b37 <- ldlink_snps %>%
    extract(Coord, into = c("chr", "start"), regex = "(chr[0-9XY]+):(\\d+)",
            remove = FALSE) %>%
    mutate(end = start) %>%
    select(seqnames = chr, start, end, snp = RS_Number, index_snp,
           coord_b37 = Coord, ref, alt) %>%
    makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# Lift to hg38 and flatten. SNPs without a proper rsID are identified by their
# hg38 coordinate instead.
ldlink_snps_b38 <- ldlink_snps_gr_b37 %>%
    liftOver(hg19_to_hg38_chain) %>%
    unlist() %>%
    as_tibble() %>%
    mutate(coord_b38 = paste0(seqnames, ":", start),
           snp = ifelse(is.na(snp) | !str_detect(snp, "^rs\\d+"), coord_b38, snp)) %>%
    select(snp, coord_b38, ref, alt, index_snp, coord_b37) %>%
    distinct()



## HaploReg data is in hg38 coordinates, but not all snps returned have genome coordinates

haploreg_snps <- haploreg_results_table %>%
    transmute(snp = rsID,
              coord_b38 = ifelse(is.na(chr), NA, paste0("chr", chr, ":", pos_hg38)),
              ref, alt, index_snp)

# rsIDs that HaploReg returned without a coordinate.
haploreg_snps_no_coord <- unique(haploreg_snps$snp[is.na(haploreg_snps$coord_b38)])

## Try to rescue location data from SNPlocs packages and GTEx variant info

# Look the rsIDs up in the main SNPlocs package; unknown ids are dropped.
snplocs_hits_b38 <- snpsById(SNPlocs.Hsapiens.dbSNP151.GRCh38,
                             haploreg_snps_no_coord, ifnotfound = "drop")
haploreg_snps_find_locs_b38 <- as_tibble(GRanges(snplocs_hits_b38)) %>%
    # Strip a leading "chr"/"ch" so chromosome names are bare.
    mutate(chr = str_replace(as.character(seqnames), "^(chr|ch)", "")) %>%
    select(chr, pos_b38 = start, snp = RefSNP_id)

# Same lookup against the XtraSNPlocs package.
snplocs_hits_b38_xtra <- snpsById(XtraSNPlocs.Hsapiens.dbSNP141.GRCh38,
                                  haploreg_snps_no_coord, ifnotfound = "drop")
haploreg_snps_find_locs_b38_xtra <- as_tibble(GRanges(snplocs_hits_b38_xtra)) %>%
    mutate(chr = str_replace(as.character(seqnames), "^(chr|ch)", "")) %>%
    select(chr, pos_b38 = start, snp = RefSNP_id)


# Optional third rescue source: a GTEx variant lookup table. Without one, an
# empty tibble is used so the later bind_rows() is unaffected.
if (is.null(snakemake@config$gtex_table)) {
    haploreg_snps_find_locs_gtex <- tibble()
} else {
    # The compact col_types spec reads three columns as character and skips
    # the rest.
    gtex_var_map <- dplyr::rename(
        read_tsv(snakemake@config$gtex_table, col_types = "c-----cc"),
        rs_id = "rs_id_dbSNP151_GRCh38p7"
    )

    # Keep the variants whose rsID still lacks a coordinate and parse
    # chromosome and position out of the "chr<N>_<pos>..." variant id.
    haploreg_snps_find_locs_gtex <- gtex_var_map %>%
        filter(rs_id %in% haploreg_snps_no_coord) %>%
        extract(variant_id, into = c("chr", "pos_b38"), regex = "^chr([0-9XY]+)_(\\d+)") %>%
        transmute(chr, pos_b38 = as.numeric(pos_b38), snp = rs_id)
}


# Pool the rescued locations from all sources into an rsID -> coordinate table.
haploreg_snps_find_locs_combined <- bind_rows(
    haploreg_snps_find_locs_b38,
    haploreg_snps_find_locs_b38_xtra,
    haploreg_snps_find_locs_gtex
) %>%
    distinct() %>%
    transmute(snp, coord_b38_rescue = paste0("chr", chr, ":", pos_b38))


# Fill missing HaploReg coordinates from the rescue table. The join has no
# `by`, so it matches on the shared column(s).
haploreg_snps_b38 <- haploreg_snps %>%
    left_join(haploreg_snps_find_locs_combined) %>%
    mutate(coord_b38 = as.character(ifelse(is.na(coord_b38), coord_b38_rescue, coord_b38)),
           coord_b38_rescue = NULL) %>%
    distinct()



## Combine LD SNPs


# Tag each table with its origin before stacking them.
ldlink_tagged <- mutate(ldlink_snps_b38, source = "LDlink")
haploreg_tagged <- mutate(haploreg_snps_b38, source = "HaploReg")
ld_snps_b38 <- bind_rows(ldlink_tagged, haploreg_tagged)



## Get TxDb annotations

# Single-base ranges for every LD SNP whose coordinate string parses.
ld_snps_b38_gr <- ld_snps_b38 %>%
    extract(coord_b38, into = c("seqnames", "start"), regex = "(.+):(\\d+)") %>%
    filter(!(is.na(seqnames) | is.na(start))) %>%
    transmute(seqnames, start, end = start, snp) %>%
    makeGRangesFromDataFrame(keep.extra.columns = TRUE)

# Locate each position relative to the hg38 knownGene models.
ld_snps_txdb_loc <- locateVariants(ld_snps_b38_gr, TxDb.Hsapiens.UCSC.hg38.knownGene, AllVariants())

# One row per coordinate with its distinct location labels joined by ";".
ld_snps_txdb_loc_df <- ld_snps_txdb_loc %>%
    as_tibble() %>%
    mutate(coord_b38 = paste0(seqnames, ":", start),
           txdb_annot = LOCATION) %>%
    select(coord_b38, txdb_annot) %>%
    distinct() %>%
    group_by(coord_b38) %>%
    summarise(txdb_annot = paste(txdb_annot, collapse = ";"))

# Attach the annotation to the combined LD SNP table and write it out.
ld_snps_b38_annot <- left_join(ld_snps_b38,
                               ld_snps_txdb_loc_df, by = "coord_b38")


write_tsv(ld_snps_b38_annot, snakemake@output$ld_snps)

R tidyverse SNPlocs.Hsapiens.dbSNP144.GRCh37 TxDb.Hsapiens.UCSC.hg38.knownGene magrittr ucsc-liftover Greedy algorithm for Set Cover problem VariantAnnotation doMC LDlinkR haploR From line 3 of scripts/GetSNPsInLD.R

A snakemake workflow to process ATAC-seq data

# Command-line interface: <bamfile> <species>, where species is "mm" (mm10)
# or "hs" (hg38).
myargs <- commandArgs(trailingOnly=TRUE)
# Without both arguments `species` would be NA and the comparison below would
# abort with "missing value where TRUE/FALSE needed"; fail with a usage
# message instead.
if (length(myargs) < 2) {
  print("params ERROR: expected two arguments: <bamfile> <species (mm|hs)>")
  print("exiting...")
  quit(status=1)
}
bamfile <- myargs[1]
species <- myargs[2]

print("loading packages (ATACseqQC, ggplot, etc)...")
suppressPackageStartupMessages(library(ggplot2, quietly=TRUE))
suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE))
suppressPackageStartupMessages(library(ATACseqQC, quietly=TRUE))
suppressPackageStartupMessages(library(ChIPpeakAnno, quietly=TRUE))
suppressPackageStartupMessages(library("GenomicAlignments", quietly=TRUE))

# Pick the annotation (txdb) and genome objects matching the species code.
if (species == "mm") {
  suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, quietly=TRUE))
  suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm10, quietly=TRUE))
  txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
  bsgenome <- BSgenome.Mmusculus.UCSC.mm10
  genome <- Mmusculus
  print("species is 'mm' using mm10 for analysis")
  ### Note: Everything below is deprecated until I can figure out a way to port a 
  ### static/local package with snakemake
  # Note: phastCons60way was manually curated from GenomicAlignments, built, and installed as an R package
  # score was obtained according to: https://support.bioconductor.org/p/96226/
  # package was built and installed according to: https://www.bioconductor.org/packages/devel/bioc/vignettes/GenomicScores/inst/doc/GenomicScores.html
  # (section 5.1: Building an annotation package from a GScores object)
  #suppressWarnings(suppressPackageStartupMessages(library(GenomicScores, lib.loc="/users/dia6sx/snakeATAC/scripts/", quietly=TRUE)))
  #suppressWarnings(suppressPackageStartupMessages(library(phastCons60way.UCSC.mm10, lib.loc="/users/dia6sx/snakeATAC/scripts/", quietly=TRUE)))
} else if (species == "hs") {
  suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg38.knownGene, quietly=TRUE))
  suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38, quietly=TRUE))
  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
  bsgenome <- BSgenome.Hsapiens.UCSC.hg38
  genome <- Hsapiens
  print("species is 'hs' using hg38 for analysis")
} else {
  print(paste("params ERROR: ATACseqQC is not configured to use species =", species))
  print("exiting...")
  quit(status=1)
}

# Run the ATACseqQC quality-control workflow on one BAM file and write the
# resulting figures as PNG files.
#
# Args:
#   bamfile:  path to the BAM file. Figures are written to
#             <grandparent dir of bamfile>/qc/ATACseqQC; the temporary
#             directories alignments_shifted and alignments_split are created
#             next to it and removed at the end.
#   txdb:     TxDb annotation object; its transcripts define the
#             promoters/TSSs used by the scores.
#   bsgenome: not used inside this function; kept so existing callers that
#             pass it keep working.
#   genome:   genome object handed to splitGAlignmentsByCut().
#
# Returns: the value of the final unlink() call; the function is called for
#   its side effects (PNG files, progress messages).
doATACseqQC <- function(bamfile, txdb, bsgenome, genome) {
    sample_dir <- dirname(dirname(bamfile))

    # Fragment size distribution
    print(paste("generating output for ",strsplit(basename(bamfile),split='\\.')[[1]][1],"...",sep=""))
    print("calculating Fragment size distribution...")
    # Strip only a literal trailing ".bam". The previous pattern ".bam" was an
    # unanchored regex whose dot matches any character, so a name such as
    # "x_bam1.bam" lost "_bam" as well.
    bamfile.labels <- sub("\\.bam$", "", basename(bamfile))
    loc_to_save_figures <- file.path(sample_dir, "qc", "ATACseqQC")
    if (file.exists(loc_to_save_figures)) {
        print("Warning: old figures will be overwritten")
    } else {
        # recursive: the parent "qc" directory may not exist yet either.
        dir.create(loc_to_save_figures, recursive = TRUE)
    }
    # Path of a figure for this sample, given the file-name suffix.
    figure_path <- function(suffix) {
        file.path(loc_to_save_figures, paste0(bamfile.labels, suffix))
    }
    png(figure_path("_fragment_size_distribution.png"))
    fragSizeDist(bamfile, bamfile.labels)
    dev.off()

    # Adjust the read start sites
    print("adjusting read start sites...")
    ## bamfile tags to be read in
    possibleTag <- list("integer"=c("AM", "AS", "CM", "CP", "FI", "H0", "H1", "H2", 
                                    "HI", "IH", "MQ", "NH", "NM", "OP", "PQ", "SM",
                                    "TC", "UQ"), 
                    "character"=c("BC", "BQ", "BZ", "CB", "CC", "CO", "CQ", "CR",
                                "CS", "CT", "CY", "E2", "FS", "LB", "MC", "MD",
                                "MI", "OA", "OC", "OQ", "OX", "PG", "PT", "PU",
                                "Q2", "QT", "QX", "R2", "RG", "RX", "SA", "TS",
                                "U2"))
    # Probe the first 100 records to find out which of those tags are present.
    bamTop100 <- scanBam(BamFile(bamfile, yieldSize = 100),
                     param = ScanBamParam(tag=unlist(possibleTag)))[[1]]$tag
    tags <- names(bamTop100)[lengths(bamTop100)>0]
    ## files will be output into shifted_dir
    ## shift the coordinates of 5'ends of alignments in the bam file
    shifted_dir <- file.path(sample_dir, "alignments_shifted")
    shiftedBamfile <- file.path(shifted_dir, paste0(bamfile.labels, "_shifted.bam"))
    # check if shifted Bam file exists from previous run
    if (file.exists(shiftedBamfile)) {
        print("Shifted Bamfile found.")
        print("Loading in...")
        gal <- readBamFile(shiftedBamfile, tag=tags, asMates=TRUE, bigFile=TRUE)
        ## This step is mostly for formating so splitBam can
        ## take in bamfile. Implementing shift of 0 bp on positive strand
        ## and 0 bp on negative strand because shifted Bamfile should
        ## already have these shifts
        gal1 <- shiftGAlignmentsList(gal, positive = 0L, negative = 0L)
    } else {
        # shifted bam file does not exist check if
        # old shifted alignments directory exists
        # if so remove and create new one
        if (file.exists(shifted_dir)){
            unlink(shifted_dir,recursive=TRUE)
        }
        dir.create(shifted_dir)
        print("*** creating shifted bam file ***")
        # The original BAM is only read on this path; when a shifted BAM
        # already exists it is never needed.
        gal <- readBamFile(bamfile, tag=tags, asMates=TRUE, bigFile=TRUE)
        gal1 <- shiftGAlignmentsList(gal, outbam=shiftedBamfile)
    }

    # Promoter/Transcript body (PT) score
    print("calculating Promoter/Transcript body (PT) score...")
    txs <- transcripts(txdb)
    pt <- PTscore(gal1, txs)
    png(figure_path("_ptscore.png"))
    plot(pt$log2meanCoverage, pt$PT_score, 
        xlab="log2 mean coverage",
        ylab="Promoter vs Transcript",
        main=paste(bamfile.labels,"PT score"))
    dev.off()

    # Nucleosome Free Regions (NFR) score
    print("calculating Nucleosome Free Regions (NFR) score")
    nfr <- NFRscore(gal1, txs)
    png(figure_path("_nfrscore.png"))
    plot(nfr$log2meanCoverage, nfr$NFR_score, 
        xlab="log2 mean coverage",
        ylab="Nucleosome Free Regions score",
        main=paste(bamfile.labels,"\n","NFRscore for 200bp flanking TSSs",sep=""),
        xlim=c(-10, 0), ylim=c(-5, 5))
    dev.off()

    # Transcription Start Site (TSS) Enrichment Score
    print("calculating Transcription Start Site (TSS) Enrichment score")
    tsse <- TSSEscore(gal1, txs)
    png(figure_path("_tss_enrichment_score.png"))
    plot(100*(-9:10-.5), tsse$values, type="b", 
        xlab="distance to TSS",
        ylab="aggregate TSS score",
        main=paste(bamfile.labels,"\n","TSS Enrichment score",sep=""))
    dev.off()

    # Split reads, Heatmap and coverage curve for nucleosome positions
    print("splitting reads by fragment length...")
    split_dir <- file.path(sample_dir, "alignments_split")
    TSS <- promoters(txs, upstream=0, downstream=1)
    TSS <- unique(TSS)
    ## estimate the library size for normalization
    librarySize <- estLibSize(bamfile)
    ## calculate the signals around TSSs.
    NTILE <- 101
    dws <- ups <- 1010
    fragment_classes <- c("NucleosomeFree",
                          "mononucleosome",
                          "dinucleosome",
                          "trinucleosome")
    splitBamfiles <- file.path(split_dir, paste0(fragment_classes, ".bam"))
    # check if split Bam files exists from previous run
    if (all(file.exists(splitBamfiles))) {
        print("*** split bam files found! ***")
        print("Loading in...")
        sigs <- enrichedFragments(bamfiles=splitBamfiles,
                                    index=splitBamfiles, 
                                    TSS=TSS,
                                    librarySize=librarySize,
                                    TSS.filter=0.5,
                                    n.tile = NTILE,
                                    upstream = ups,
                                    downstream = dws)
    } else {
        # split bam files do not exist check if
        # old split alignments directory exists
        # if so remove and create new one
        if (file.exists(split_dir)){
            unlink(split_dir,recursive=TRUE)
        }
        print("*** creating split bam files ***")
        dir.create(split_dir)
        ## split the reads into NucleosomeFree, mononucleosome, 
        ## dinucleosome and trinucleosome.
        ## and save the binned alignments into bam files.
        objs <- splitGAlignmentsByCut(gal1, txs=txs, genome=genome, outPath = split_dir)
        #objs <- splitBam(bamfile, tags=tags, outPath=outPath,
            #        txs=txs, genome=genome,
            #       conservation=phastCons60way.UCSC.mm10,
            #      seqlev=paste0("chr", c(1:19, "X", "Y")))
        sigs <- enrichedFragments(gal=objs[fragment_classes], 
                                    TSS=TSS,
                                    librarySize=librarySize,
                                    TSS.filter=0.5,
                                    n.tile = NTILE,
                                    upstream = ups,
                                    downstream = dws)
    }
    ## log2 transformed signals
    sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1))
    ## plot heatmap
    png(figure_path("_nucleosome_pos_heatmap.png"))
    featureAlignedHeatmap(sigs.log2, reCenterPeaks(TSS, width=ups+dws),
                        zeroAt=.5, n.tile=NTILE)
    dev.off()
    ## get signals normalized for nucleosome-free and nucleosome-bound regions.
    out <- featureAlignedDistribution(sigs, 
                                    reCenterPeaks(TSS, width=ups+dws),
                                    zeroAt=.5, n.tile=NTILE, type="l", 
                                    ylab="Averaged coverage")
    ## rescale the nucleosome-free and nucleosome signals to 0~1
    range01 <- function(x){(x-min(x))/(max(x)-min(x))}
    out <- apply(out, 2, range01)
    png(figure_path("_TSS_signal_distribution.png"))
    matplot(out, type="l", xaxt="n", 
            xlab="Position (bp)", 
            ylab="Fraction of signal",
            main=paste(bamfile.labels,"\n","TSS signal distribution",sep=""))
    axis(1, at=seq(0, 100, by=10)+1, 
        labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2)
    abline(v=seq(0, 100, by=10)+1, lty=2, col="gray")
    dev.off()

    print("QC Finished.")
    print("Generated QC figures can be found in qc folder under ATACseQC")
    # Remove both temporary alignment directories.
    print(paste("*** removing temp files in",split_dir,"***"))
    unlink(split_dir,recursive=TRUE)
    print(paste("*** removing temp files in",shifted_dir,"***"))
    unlink(shifted_dir,recursive=TRUE)
}

doATACseqQC(bamfile, txdb, bsgenome, genome)

R Snakemake ggplot2 BSgenome.Hsapiens.UCSC.hg38 BSgenome.Mmusculus.UCSC.mm10 TxDb.Mmusculus.UCSC.mm10.knownGene TxDb.Hsapiens.UCSC.hg38.knownGene GenomicAlignments Rsamtools ATACseqQC GenomicScores From line 23 of scripts/doATACseqQC.R

Snakemake pipeline for Epicure analyses (0.14.1)

base::message("Loading libraries ... ")
suppressPackageStartupMessages(library("ATACseqQC"))
suppressPackageStartupMessages(library("TxDb.Hsapiens.UCSC.hg38.knownGene"))
suppressPackageStartupMessages(library("BSgenome.Hsapiens.UCSC.hg38"))
suppressPackageStartupMessages(library("phastCons100way.UCSC.hg38"))
suppressPackageStartupMessages(library("MotifDb"))
suppressPackageStartupMessages(library("ChIPpeakAnno"))
suppressPackageStartupMessages(library("Rsamtools"))
base::message("Libraries loaded.")

# base::message("Setting sequence level style...")
# seqlevelsStyle(TxDb.Hsapiens.UCSC.hg38.knownGene) <- "Ensembl"
# seqlevelsStyle(BSgenome.Hsapiens.UCSC.hg38) <- "Ensembl"
# base::message("Database chromosome renamed.")

# Wrap the input BAM path in a BamFile object and read the sample name from
# the rule parameters; both are echoed for the log.
base::message("Acquiering bam file...")
bam_path <- base::as.character(x = snakemake@input[["bam"]])
bamfile <- BamFile(file = bam_path)
name <- base::as.character(x = snakemake@params[["name"]])
base::print(bamfile)
base::print(name)
base::message("BamFiles identified")

# Find out which optional BAM tags occur in this file by probing its first
# 100 records for every candidate tag.
base::message("Reading bam tags...")
possibleTag <- list(
    "integer" = c("AM", "AS", "CM", "CP", "FI", "H0", "H1", "H2", "HI",
                  "IH", "MQ", "NH", "NM", "OP", "PQ", "SM", "TC", "UQ"),
    "character" = c("BC", "BQ", "BZ", "CB", "CC", "CO", "CQ", "CR", "CS",
                    "CT", "CY", "E2", "FS", "LB", "MC", "MD", "MI", "OA",
                    "OC", "OQ", "OX", "PG", "PT", "PU", "Q2", "QT", "QX",
                    "R2", "RG", "RX", "SA", "TS", "U2")
)
bam_head <- BamFile(bamfile$path, yieldSize = 100)
tag_probe <- ScanBamParam(tag = unlist(possibleTag))
bamTop100 <- scanBam(bam_head, param = tag_probe)[[1]]$tag
# Keep the names of the tags that returned any value.
tags <- names(bamTop100)[lengths(bamTop100) > 0]
base::print(tags)
base::message("Tags Acquired")

# Restrict the analysis to chr1-22, chrX and chrY: build the sequence names
# and turn the matching entries of the TxDb's seqinfo into whole-chromosome
# ranges.
base::message("Retrieving sequence level informations...")
seqlev <- paste0("chr", c(1:22, "X", "Y"))
seqinformation <- seqinfo(TxDb.Hsapiens.UCSC.hg38.knownGene)
which <- as(seqinformation[seqlev], "GRanges")
base::print(which)
base::message("Sequences retrived")

# Read the paired alignments that fall on the selected chromosomes, keeping
# the tags detected above.
base::message("Loading bam file...")
bamdata <- readBamFile(
    bamFile = bamfile$path,
    which = which,
    # The parameter is spelled `tag`; `tags = ...` does not match it by name
    # and is swallowed by `...`, so the detected tags would not be requested.
    tag = tags,
    asMates = TRUE,
    bigFile = TRUE
    # No trailing comma: it would add an empty argument to the call.
)
base::message("Bam file loaded")

# Shift the alignments and write them to the rule's "shifted" output, creating
# its directory first.
base::message("Shifting bam...")
shiftedBamfile <- base::as.character(x = snakemake@output[["shifted"]])
shiftedBamdir <- base::dirname(shiftedBamfile)
print(shiftedBamdir)
base::dir.create(path = shiftedBamdir, recursive = TRUE)
# bamdata is replaced by the shifted alignments.
bamdata <- shiftGAlignmentsList(gal = bamdata, outbam = shiftedBamfile)
print(bamdata)
base::message("Shift OK")

# Look the requested motif up in MotifDb; the first hit is the one used for
# footprinting.
base::message("Acquiering motif...")
motif_name <- base::as.character(x = snakemake@params[["motif"]])
motif <- query(MotifDb, c(motif_name))
motif <- as.list(motif)
# Fail with an explicit message when nothing matches, instead of the
# "subscript out of bounds" error that motif[[1]] would raise.
if (length(motif) == 0) {
    base::stop("No MotifDb entry matches motif '", motif_name, "'", call. = FALSE)
}
print(motif[[1]], digits = 2)
base::message("Motif retrieved.")

# Compute and plot the factor footprint for the selected motif, then save the
# whole workspace to the rule's "rda" output.
base::message("plot Footprints...")
genome <- Hsapiens
print(genome)

png(
    filename = snakemake@output[["png"]],
    width = 1024,
    height = 768,
    units = "px"
)
sigs <- factorFootprints(
    shiftedBamfile,
    pfm = motif[[1]],
    genome = genome,
    min.score = "90%",
    # Use the same "chr"-prefixed names the reads were restricted to above.
    # The genome object is the UCSC one and its sequence style is not changed
    # in this script, so bare "1".."22","X","Y" would not match its names.
    seqlev = seqlev,
    upstream = 100,
    downstream = 100
)
dev.off()
base::message("Done.")

base::save.image(file = base::as.character(x = snakemake@output[["rda"]]))
base::message("Process over")

R BSgenome.Hsapiens.UCSC.hg38 TxDb.Hsapiens.UCSC.hg38.knownGene Rsamtools ChIPpeakAnno ATACseqQC MotifDb From line 1 of factorfootprints/factor_footprints.R

A Snakemake workflow to analyse and visualise Illumina Infinium Methylation arrays

# Annotate regions with their nearest/overlapping gene features and gene
# symbols.
#
# Args:
#   dmrs:                  regions to annotate; anything GRanges() accepts.
#   outputLoc:             passed as `output` to annotatePeakInBatch().
#   featureLocForDistance: passed as `FeatureLocForDistance`.
#   bindingRegion:         passed as `bindingRegion`.
#   organism:              genome build of the gene models, "hg38" or "hg19".
#
# Returns: the annotated regions as produced by annotatePeakInBatch(), with an
#   added `symbol` column looked up from org.Hs.egSYMBOL.
#
# Raises: an error when `organism` is not one of the supported builds.
addAnno <- function(dmrs, outputLoc = "nearestLocation", featureLocForDistance="TSS", 
                    bindingRegion=c(-2000, 2000), organism = "hg38"){

    # Validate first: an unsupported (or NULL) build used to leave `annoData`
    # undefined and fail later with an unrelated-looking error.
    supported <- c("hg38", "hg19")
    if (!is.character(organism) || length(organism) != 1L || !organism %in% supported) {
        stop("addAnno: unsupported organism; expected one of: ",
             paste(supported, collapse = ", "), call. = FALSE)
    }

    library(GenomicRanges)
    library(ChIPpeakAnno)
    library(org.Hs.eg.db)

    dmrs <- GRanges(dmrs)

    # Load only the gene-model package for the requested build.
    if (organism == "hg38") {

        library(TxDb.Hsapiens.UCSC.hg38.knownGene)

        annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg38.knownGene)

    } else {

        library(TxDb.Hsapiens.UCSC.hg19.knownGene)

        annoData <- toGRanges(TxDb.Hsapiens.UCSC.hg19.knownGene)

    }

    # Make the regions use the same sequence naming style as the annotation.
    seqlevelsStyle(dmrs) <- seqlevelsStyle(annoData)

    anno_dmrs <- annotatePeakInBatch(dmrs, AnnotationData = annoData, 
                                    output = outputLoc, 
                                    FeatureLocForDistance = featureLocForDistance,
                                    bindingRegion = bindingRegion)

    anno_dmrs$symbol <- xget(anno_dmrs$feature, org.Hs.egSYMBOL)

    return(anno_dmrs)

}

# Script driver: read the DMRs, annotate them with addAnno() and write the
# result as CSV, as an rtracklayer export and as RDS.
#
# Args:
#   input:  provides `rds` (path of the DMR object to read).
#   output: provides `csv`, `bed` and `rds` (paths to write).
#   params: provides `output`, `featureLocForDistance`, `bindingRegion` and
#           `organism`, all forwarded to addAnno().
#   log:    provides `out` and `err` (paths of the two log files).
main <- function(input, output, params, log) {

    # Send printed output to log$out and messages to log$err.
    stdout_log <- file(log$out, open = "wt")
    stderr_log <- file(log$err, open = "wt")
    sink(stdout_log, type = "output")
    sink(stderr_log, type = "message")

    library(minfi)
    library(DMRcate)
    library(rtracklayer)

    raw_dmrs <- readRDS(input$rds)

    # Annotation settings come straight from the params
    # (e.g. "nearestLocation", "TSS", c(-2000, 2000)).
    annotated <- addAnno(
        raw_dmrs,
        params$output,
        params$featureLocForDistance,
        params$bindingRegion,
        params$organism
    )

    # Write the three outputs: table, track export, serialized object.
    write.csv(as.data.frame(annotated), output$csv)
    rtracklayer::export(annotated, output$bed)
    saveRDS(annotated, file = output$rds)

}

# Entry point: run main() with the input, output, params and log slots of the
# `snakemake` object (presumably provided by Snakemake's script directive).
main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)

R org.Hs.eg.db TxDb.Hsapiens.UCSC.hg19.knownGene TxDb.Hsapiens.UCSC.hg38.knownGene GenomicRanges minfi DMRcate From line 3 of scripts/annotate.R

# Build a named list of GRanges tracks carrying beta values.
#
# Args:
#   filter:  object exposing featureNames(), getBeta() and colData()
#            (the final filtered Genomic Ratio set, per the original comment).
#   anno:    genome build used in the manifest file name.
#   array:   array type used in the manifest file name.
#   combine: NULL for one track per sample; otherwise forwarded to
#            combineBeta() (e.g. "mean", "median") to get one track per group.
#   by:      colData column whose unique values define the groups.
#
# Returns:
#   A list of GRanges, named by sample (combine = NULL) or by group label.
getTrackObj <- function(filter, anno = "hg38", array = "HM450", combine = "mean", by = "status"){

  library(TxDb.Hsapiens.UCSC.hg38.knownGene)

  manifest <- readRDS(paste0("resources/", array, ".", anno, ".manifest.rds"))

  # only keep cpgs which are in final filtered Genomic Ratio set
  keep <- names(manifest) %in% featureNames(filter)

  manifest <- manifest[keep,]

  # Get Beta table from GRSet

  beta <- getBeta(filter)

  # match order of beta table with manifest meta data
  # drop = FALSE keeps a one-sample table as a matrix; otherwise it collapses to
  # a vector, the dimname checks below compare against NULL and the colnames
  # assignment fails.

  beta <- beta[match(names(manifest), rownames(beta)), , drop = FALSE]

  # check rownames equal

  stopifnot(rownames(beta) == names(manifest))

  # Add new colnames to beta table matching colData

  stopifnot(colnames(beta) == rownames(colData(filter)))

  colnames(beta) <- colData(filter)$Sample_Name

  # Add beta signal to tracks GRanges mcols instead of other data

  tracks <- manifest 

  mcols(tracks) <- beta

  # Turn into list of separate GRanges objects

  if (is.null(combine)) {

    # One track per sample column.
    sample_cols <- colnames(mcols(tracks))

    tracksList <- lapply(sample_cols, function(x) tracks[, colnames(mcols(tracks)) == x])

    names(tracksList) <- sample_cols

    tracksList <- lapply(tracksList, filterTrackOverlaps)

  } else {

    # One combined track per unique value of the `by` column.
    combine_by <- unique(colData(filter)[[by]])

    tracksList <- lapply(combine_by, combineBeta, tracks = tracks, colData = colData(filter), by = by, combine = combine)

    names(tracksList) <- combine_by

  }

  return(tracksList)
}

# Combine samples together based on colData col name and label into combined tracks

# Collapse the beta columns of all samples carrying `label` into one track.
#
# Args:
#   label:      value(s) of the `by` column selecting the samples to combine.
#   tracks:     GRanges whose mcols hold one beta column per sample.
#   colData:    sample annotation table with columns `by` and `samplename`.
#   by:         name of the colData column compared against `label`.
#   combine:    "mean" or "median": row-wise summary applied across samples.
#   samplename: colData column holding the names used as mcols column names.
#
# Returns:
#   The result of filterTrackOverlaps() on a copy of `tracks` whose only
#   metadata column is the combined value.
#
# Raises:
#   An error if `combine` is neither "mean" nor "median".
combineBeta <- function(label, tracks, colData, by, combine = "mean", samplename = "Sample_Name"){

  # Reject unknown methods up front; they used to skip both branches and fail
  # later with "object 'combineMeta' not found".
  if (!is.character(combine) || length(combine) != 1L || !combine %in% c("mean", "median")) {

    stop("Unsupported combine method ", paste(deparse(combine), collapse = ""),
         "; expected \"mean\" or \"median\"", call. = FALSE)

  }

  library(TxDb.Hsapiens.UCSC.hg38.knownGene)

  samples_int <- colData[colData[[by]] %in% label,][[samplename]]

  # Beta values of the selected samples as a matrix. rowMeans() performs this
  # same conversion internally for a data.frame, whereas rowMedians() needs a
  # matrix and does not accept a data.frame.
  sample_values <- as.matrix(as.data.frame(mcols(tracks[, colnames(mcols(tracks)) %in% samples_int])))

  if (combine == "mean") {

    combineMeta <- rowMeans(sample_values)

  } else {

    combineMeta <- rowMedians(sample_values)

  }

  tracks_new <- tracks

  mcols(tracks_new) <- combineMeta

  tracks_new <- filterTrackOverlaps(tracks_new)

  return(tracks_new)

}

# Parse a GRanges object to ensure it is ready for output
# Ensures the Beta column is labelled "score"
# Removes indeterminate chrs (*)
# Removes CpG sites which overlap boundaries - this should not be the case with any CpG GRanges obj, but rtracklayer also does not want them to share boundaries

# Prepare a single-column GRanges track for export.
#
# Renames the metadata column to "score", switches to UCSC sequence names,
# drops the "*" sequence level, takes seqinfo from the hg38 TxDb, and removes
# every range that touches or overlaps the next range on the same chromosome.
#
# Args:
#   tracks: GRanges with exactly one metadata column.
#
# Returns:
#   The cleaned GRanges, sorted with sort().
#
# NOTE(review): seqinfo always comes from the hg38 TxDb, even when the caller
# works with another build; confirm this is intended.
filterTrackOverlaps <- function(tracks){

    library(TxDb.Hsapiens.UCSC.hg38.knownGene)

    txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene

    colnames(mcols(tracks)) <- "score"

    seqlevelsStyle(tracks) <- "UCSC"

    seqlevels(tracks) <- seqnames(seqinfo(tracks))[seqnames(seqinfo(tracks)) != "*"]

    seqinfo(tracks) <- seqinfo(txdb)[seqnames(seqinfo(tracks))[seqnames(seqinfo(tracks)) != "*"]]

    # Order by position, ignoring strand, so that neighbours in the vector are
    # neighbours on the genome before they are compared.
    tracks <- sort(tracks, ignore.strand = TRUE)

    n_ranges <- length(tracks)

    # With fewer than two ranges there is nothing to compare. The previous
    # n-1 length mask was recycled, which emptied such inputs and applied the
    # first verdict to the last range.
    if (n_ranges > 1L) {

        chrom <- as.character(seqnames(tracks))
        range_start <- start(tracks)
        range_end <- end(tracks)

        current <- seq_len(n_ranges - 1L)
        following <- current + 1L

        # A range clashes when the next range on the SAME chromosome starts at
        # or before its end; ranges on different chromosomes never clash.
        clash <- chrom[current] == chrom[following] &
            range_start[following] <= range_end[current]

        # The last range has no successor and is always kept.
        tracks <- tracks[c(!clash, TRUE)]

    }

    # resort
    tracks <- sort(tracks)

    return(tracks)

}

# Save out track via rtracklayer

saveTrack <- function(sample, tracks, fileExt = ".BigWig", location = "./"){

  # Export the track stored under `sample` in `tracks` with rtracklayer.
  # The destination is the plain concatenation location + sample + fileExt,
  # so `location` has to carry its own trailing separator.

  library(rtracklayer)

  selected <- tracks[sample]

  destination <- paste0(location, sample, fileExt)

  rtracklayer::export(selected[[1]], destination)

}

# Script driver: build beta-value tracks from the filtered set and write them
# as BigWig, bedGraph and RDS.
#
# Args:
#   input:  provides `rds` (path of the filtered object to read).
#   output: provides `rds` (path for the list of GRanges).
#   params: provides `anno`, `array`, `combine`, `by` (forwarded to
#           getTrackObj()) and `bwLocation` (prefix for the exported tracks).
#   log:    provides `out` and `err` (paths of the two log files).
main <- function(input, output, params, log) {

    # Log

    out <- file(log$out, open = "wt")

    err <- file(log$err, open = "wt")

    sink(out, type = "output")

    sink(err, type = "message")

    # Script

    library(minfi)
    library(DMRcate)
    library(rtracklayer)

    filter <- readRDS(input$rds)

    # params
    anno <- params$anno # "hg38", "hg19"
    array <- params$array # "EPIC", "HM450"
    combine <- params$combine  # "mean", "median"
    by <- params$by # "sample"

    # build tracks
    tracks <- getTrackObj(filter, anno, array, combine, by)

    # save output: one BigWig and one bedGraph file per track.
    # The argument is spelled `tracks` in full; the previous `track =` only
    # reached saveTrack() through partial argument matching.
    for (ext in c(".BigWig", ".bedGraph")) {

        lapply(names(tracks), saveTrack, tracks = tracks, fileExt = ext, location = params$bwLocation)

    }

    # save out list of GRanges 
    saveRDS(tracks, file = output$rds)

}

# Entry point: run main() with the input, output, params and log slots of the
# `snakemake` object (presumably provided by Snakemake's script directive).
main(snakemake@input, snakemake@output, snakemake@params, snakemake@log)

R TxDb.Hsapiens.UCSC.hg38.knownGene rtracklayer minfi DMRcate From line 3 of scripts/tracks.R

Workflow Steps and Code Snippets

MPRA GWAS Builder: snakemake workflow

A snakemake workflow to process ATAC-seq data

Snakemake pipeline for Epicure analyses (0.14.1)

A Snakemake workflow to analyse and visualise Illumina Infinium Methylation arrays

TxDb.Hsapiens.UCSC.hg38.knownGene