Code Snippets

cfMeDIP-seq Circulating Methylome Data Post-processing Pipeline

# Command-line interface for run_medestrand.R, parsed by docopt.
# The help text matches what the script does further down: OUTPUT is written
# with write_tsv() (a tab-separated table, not an RDS file), and PAIRED is
# compared against the literal string "True".
doc <- ' run_medestrand.R
Run MeDEStrand.

Usage:
    run_medestrand.R -b BAM -o OUTPUT -p PAIRED [ -m MEDESTRAND ]

Options:
    -b --bam BAM                Path to input BAM file
    -o --output OUTPUT          Output path (TSV file)
    -p --paired PAIRED          "True" if the sample is paired-end sequencing; any other value means single-end
    -m --medestrand MEDESTRAND  Path to MeDEStrand Package
'

if (!interactive()) {
    # Non-interactive run: parse the command line and echo the parsed values.
    library(docopt)
    args <- docopt(doc, version = 'Run MeDEStrand v 1.0')
    print(args)
} else {
    # Interactive session: `args` is NOT created here; the user must define it.
    message('Running in interactive mode. Be sure to specify args manually.')
}

# Load MeDEStrand: from the source directory given via --medestrand when one
# was supplied, otherwise the installed package.
medestrand_path <- args[['medestrand']]
if (!is.null(medestrand_path)) {
    devtools::load_all(medestrand_path)
} else {
    library(MeDEStrand)
}
library(GenomicRanges)
library(MEDIPS)
library(BSgenome.Hsapiens.UCSC.hg38)
library(tidyverse)


# Run settings: 300 bp bins over the autosomes chr1..chr22 of hg38.
BIN_WIDTH <- 300
allmainchrs <- paste0('chr', 1:22)
BSgenome <- 'BSgenome.Hsapiens.UCSC.hg38'
# Only the exact string "True" selects paired-end mode.
paired_val <- args[['paired']] == "True"

strand_sets <- MeDEStrand.createSet(
    file = args[['bam']],
    BSgenome = BSgenome,
    uniq = 1,
    extend = 0,
    shift = 0,
    window_size = BIN_WIDTH,
    chr.select = allmainchrs,
    paired = paired_val
)

# Per-bin CG counts for the reference, then the per-bin methylation estimate.
coupling_set <- MeDEStrand.countCG(pattern = 'CG', refObj = strand_sets)
bin_methylation <- MeDEStrand.binMethyl(
    MSetInput = strand_sets,
    CSet = coupling_set,
    Granges = FALSE
)

# Rebuild the genomic coordinates of every bin from the first set's metadata.
first_set <- strand_sets[[1]]
selected_chrs <- first_set@chr_names
bin_size <- window_size(first_set)
in_selection <- seqnames(BSgenome.Hsapiens.UCSC.hg38@seqinfo) %in% selected_chrs
chr_sizes <- unname(seqlengths(BSgenome.Hsapiens.UCSC.hg38)[in_selection])
bins_per_chr <- ceiling(chr_sizes / bin_size)
cumulative_bins <- cumsum(bins_per_chr)

bin_ranges <- MEDIPS.GenomicCoordinates(
    cumulative_bins, bins_per_chr, selected_chrs, chr_sizes, bin_size
)
bin_ranges$CF <- coupling_set@genome_CF
bin_ranges$binMethyl <- bin_methylation

# Flatten to a table, rename the columns and keep only the reported ones.
bin_table <- as.data.frame(bin_ranges)
colnames(bin_table) <- c("bin_chr", "bin_start", "bin_end", "bin_width",
                         "strand", "cpg_count", "bin_methyl")
kept_columns <- c("bin_chr", "bin_start", "bin_end", "cpg_count", "bin_methyl")
bin_table <- bin_table[, kept_columns]

write_tsv(bin_table, file = args[['output']], col_names = TRUE)

# Drop every object created by the script and run the garbage collector.
rm(list = ls())
gc()

R tidyverse BSgenome.Hsapiens.UCSC.hg38 GenomicRanges docopt MEDIPS From line 1 of R/run_medestrand.R

# Command-line interface for run_MEDIPS.R, parsed by docopt.
# The help text matches what the script does further down: OUTPUT is written
# with write_tsv() (a tab-separated table, not an RDS file), and PAIRED is
# compared against the literal string "True".
doc <- ' run_MEDIPS.R
Run MEDIPS for counts and conduct MEDIPS QC.

Usage:
    run_MEDIPS.R -b BAM -o OUTPUT -q QCOUT -p PAIRED

Options:
    -b --bam BAM                Path to input BAM file
    -o --output OUTPUT          Output path (TSV file)
    -q --qcout QCOUT            Path to output QC results of sample
    -p --paired PAIRED          "True" if the sample is paired-end sequencing; any other value means single-end
'

if (!interactive()) {
  # Non-interactive run: parse the command line and echo the parsed values.
  library(docopt)
  args <- docopt(doc, version = 'Run MEDIPS v 1.0')
  print(args)
} else {
  # Interactive session: `args` is NOT created here; the user must define it.
  message('Running in interactive mode. Be sure to specify args manually.')
}

library(GenomicRanges)
library(BSgenome)
library(BSgenome.Hsapiens.UCSC.hg38)
library(IRanges)
library(MEDIPS)
library(tidyverse)

# Settings shared by every MEDIPS call in this script.
BIN_WIDTH <- 300
allmainchrs <- paste0('chr', 1:22)
BSgenome <- 'BSgenome.Hsapiens.UCSC.hg38'
# Only the exact string "True" selects paired-end mode.
paired_val <- args[['paired']] == "True"

medips_set <- MEDIPS.createSet(
  file = args[['bam']],
  BSgenome = BSgenome,
  extend = 0,
  shift = 0,
  uniq = 1,
  window_size = BIN_WIDTH,
  paired = paired_val,
  chr.select = allmainchrs
)

# Rebuild the genomic coordinates of every bin from the set's metadata.
set_chrs <- medips_set@chr_names
bin_size <- window_size(medips_set)
in_selection <- seqnames(BSgenome.Hsapiens.UCSC.hg38@seqinfo) %in% set_chrs
chr_sizes <- unname(seqlengths(BSgenome.Hsapiens.UCSC.hg38)[in_selection])
bins_per_chr <- ceiling(chr_sizes / bin_size)
cumulative_bins <- cumsum(bins_per_chr)

bin_ranges <- MEDIPS.GenomicCoordinates(
  cumulative_bins, bins_per_chr, set_chrs, chr_sizes, bin_size
)
# Raw per-bin counts, and the same counts scaled by number_regions per million.
bin_ranges$counts <- medips_set@genome_count
bin_ranges$cpm <- (medips_set@genome_count / medips_set@number_regions) * 1000000

# Flatten to a table, rename the columns and keep only the reported ones.
count_df <- as.data.frame(bin_ranges)
colnames(count_df) <- c("bin_chr", "bin_start", "bin_end", "bin_width",
                        "strand", "bin_counts", "bin_cpm")
kept_columns <- c("bin_chr", "bin_start", "bin_end", "bin_counts", "bin_cpm")
count_df <- count_df[, kept_columns]

write_tsv(count_df, file = args[['output']], col_names = TRUE)

# Try the MEDIPS CpG enrichment calculation. `medipsenrichment` is TRUE when it
# succeeded (result stored in `medips_enrichment`) and FALSE when it raised an
# error, in which case the downstream QC table reports NA scores.
#
# The success value must be a plain trailing TRUE, not return(TRUE): this is
# top-level script code, so there is no enclosing function to return from.
# Under Rscript, return() here raises "no function to return from", which the
# error handler then catches, so the flag was FALSE even when the enrichment
# call itself had succeeded.
medipsenrichment <- tryCatch({
  medips_enrichment <- MEDIPS.CpGenrich(file = args[['bam']],
                                        BSgenome = BSgenome,
                                        extend = 0,
                                        shift = 0,
                                        uniq = 1,
                                        paired = paired_val,
                                        chr.select = allmainchrs)
  TRUE
}, error = function(e) {
  message('Error: unable to create medips enrichment paramaters')
  # Surface the underlying cause instead of discarding it.
  message('Cause: ', conditionMessage(e))
  FALSE
})

# --- QC inputs: CpG sequence coverage and saturation analysis ---------------
medips_coverage <- MEDIPS.seqCoverage(
  file = args[['bam']],
  pattern = "CG",
  BSgenome = BSgenome,
  extend = 0,
  shift = 0,
  uniq = 1,
  paired = paired_val,
  chr.select = allmainchrs
)

medips_saturation <- MEDIPS.saturation(
  file = args[['bam']],
  BSgenome = BSgenome,
  extend = 0,
  shift = 0,
  uniq = 1,
  window_size = BIN_WIDTH,
  nit = 10,
  nrit = 1,
  empty_bins = TRUE,
  rank = FALSE,
  chr.select = allmainchrs,
  paired = paired_val
)

# --- Percentage of CpGs at each coverage level -------------------------------
# Bins are: <= 0, (0, 1], (1, 2], (2, 3], (3, 4], (4, 5] and > 5.
coverage_breaks <- c(0, 1, 2, 3, 4, 5)
cpg_coverage <- medips_coverage$cov.res
reads_total <- medips_coverage$numberReads
reads_without_cpg <- medips_coverage$numberReadsWO
reads_without_cpg_pct <- round((reads_without_cpg / reads_total * 100), digits = 2)

# Percentage of all CpGs selected by a logical mask over `cpg_coverage`.
percent_of_cpgs <- function(mask) {
  length(cpg_coverage[mask]) / length(cpg_coverage) * 100
}

last_break <- coverage_breaks[length(coverage_breaks)]
coverage_pct <- c(
  percent_of_cpgs(cpg_coverage <= coverage_breaks[1]),
  vapply(
    seq_along(coverage_breaks)[-1],
    function(k) {
      percent_of_cpgs(cpg_coverage > coverage_breaks[k - 1] &
                        cpg_coverage <= coverage_breaks[k])
    },
    numeric(1)
  ),
  percent_of_cpgs(cpg_coverage > last_break)
)

# --- Enrichment scores (NA when the enrichment step failed) ------------------
if (!medipsenrichment) {
  enrichment_gogE <- NA
  enrichment_relH <- NA
} else {
  enrichment_gogE <- medips_enrichment$enrichment.score.GoGe
  enrichment_relH <- medips_enrichment$enrichment.score.relH
}

# --- One-row QC summary -------------------------------------------------------
QCstats <- data.frame(
  numReads_Unique_MEDIPS = reads_total,
  MEDIPS_Enrichment = medipsenrichment,
  EnrichmentScore_GoGe = enrichment_gogE,
  EnrichmentScore_relH = enrichment_relH,
  Percent_CpG_Seq_Coverage_0x = coverage_pct[1],
  Percent_CpG_Seq_Coverage_1x = coverage_pct[2],
  Percent_CpG_Seq_Coverage_2x = coverage_pct[3],
  Percent_CpG_Seq_Coverage_3x = coverage_pct[4],
  Percent_CpG_Seq_Coverage_4x = coverage_pct[5],
  Percent_CpG_Seq_Coverage_5x = coverage_pct[6],
  Percent_CpG_Seq_Coverage_Over5x = coverage_pct[7],
  Reads_do_not_cover_CpG = reads_without_cpg,
  Percent_Reads_do_not_cover_CpG = reads_without_cpg_pct,
  Estimated_Saturation_Correlation = medips_saturation$maxEstCor[2],
  True_Saturation_Correlation = medips_saturation$maxTruCor[2]
)

# Save the QC metrics as a tab-separated file.
write_tsv(QCstats, file = args[['qcout']], col_names = TRUE)

# Drop every object created by the script and run the garbage collector.
rm(list = ls())
gc()

R tidyverse BSgenome.Hsapiens.UCSC.hg38 GenomicRanges docopt BSgenome IRanges MEDIPS From line 1 of R/run_MEDIPS.R

# Command-line interface for run_QSEA.R, parsed by docopt.
# The help text matches what the script does further down: the QC it writes is
# QSEA QC, and OUTPUT is a directory that the script setwd()s into before
# writing its result files.
doc <- ' run_QSEA.R
Run QSEA for counts and beta value estimation and conduct QSEA QC.

Usage:
    run_QSEA.R -s SAMPLE -c CHROM -b BAM -o OUTPUT --count Count --beta BETA --qc QCOut [ --group GROUP ]

Options:
    -s --sample SAMPLE          Name of sample
    -c --chrom CHROM            Chromosome
    -b --bam BAM                Path to input BAM file
    -o --output OUTPUT          Output directory (the script changes into it before writing)

    --count Count               Output path for count data
    --beta BETA                 Output path for beta methylation estimate
    --qc QCOut                  Output path for qc matrix

    --group GROUP               Optional input of whether sample belongs to a group,
                                  such as "treatment" or "control"
'

if (!interactive()) {
  # Non-interactive run: parse the command line and echo the parsed values.
  library(docopt)
  args <- docopt(doc, version = 'Run QSEA v 1.0')
  print(args)
} else {
  # Interactive session: `args` is NOT created here; the user must define it.
  message('Running in interactive mode. Be sure to specify args manually.')
}

library(GenomicRanges)
library(BSgenome)
library(BSgenome.Hsapiens.UCSC.hg38)
library(IRanges)
library(qsea)
library(tidyverse)
library(BiocParallel)

# Register a 4-worker multicore backend with BiocParallel.
register(MulticoreParam(workers = 4))

# Run settings: 300 bp windows on one chromosome of hg38, MAPQ >= 30.
BIN_WIDTH <- 300
chrom <- args[['chrom']]
BSgenome <- 'BSgenome.Hsapiens.UCSC.hg38'
mapq <- 30

# Group label is optional on the command line; fall back to a placeholder.
sample_group <- if (is.null(args[['group']])) "unspecified" else args[['group']]

# Single-row sample table describing the one BAM handled by this run.
sample_info <- data.frame(
  sample_name = args[['sample']],
  file_name = args[['bam']],
  group = sample_group
)

qseaset <- createQseaSet(
  sampleTable = sample_info,
  BSgenome = BSgenome,
  window_size = BIN_WIDTH,
  chr.select = chrom
)

# Coverage, CpG pattern density, library factors and background offset.
qseaset <- addCoverage(
  qseaset,
  uniquePos = TRUE,
  paired = TRUE,
  parallel = TRUE,
  minMapQual = mapq
)
qseaset <- addPatternDensity(qseaset, "CG", name = "CpG")
qseaset <- addLibraryFactors(qseaset)
qseaset <- addOffset(qseaset, enrichmentPattern = "CpG")

# Windows with CpG density strictly between 1 and 15, and the signal value
# assigned to each of them, replicated into one column per sample.
cpg_density <- getRegions(qseaset)$CpG_density
wd <- which(cpg_density > 1 & cpg_density < 15)
signal <- (15 - cpg_density[wd]) * .55 / 15 + .25
signal <- matrix(
  signal,
  nrow = length(signal),
  ncol = length(getSampleNames(qseaset))
)

# Try to fit the QSEA enrichment parameters. `qseaenrichment` is TRUE when the
# fit succeeded (and `qseaset` was updated) and FALSE when it raised an error,
# in which case the downstream code writes NA beta values and skips the plot.
#
# The success value must be a plain trailing TRUE, not return(TRUE): this is
# top-level script code, so there is no enclosing function to return from.
# Under Rscript, return() here raises "no function to return from", which the
# error handler then catches, so the flag was FALSE even when the fit itself
# had succeeded.
qseaenrichment <- tryCatch({
  qseaset <- addEnrichmentParameters(
    qseaset,
    enrichmentPattern = "CpG",
    windowIdx = wd,
    signal = signal
  )
  TRUE
}, error = function(e) {
  message('Error: unable to create enrichment paramaters')
  # Surface the underlying cause instead of discarding it.
  message('Cause: ', conditionMessage(e))
  FALSE
})

# Count / rpm table is produced regardless of whether enrichment succeeded.
sample_names <- getSampleNames(qseaset)
output_counts <- makeTable(
  qseaset,
  norm_methods = c("counts", "rpm"),
  samples = sample_names
)

if (!qseaenrichment) {
  # No enrichment parameters: reuse the first four columns of the count table
  # and emit an all-NA beta column named after the sample.
  output_beta <- output_counts[, 1:4]
  output_beta$beta <- rep(NA, nrow(output_beta))
  colnames(output_beta) <- c("chr", "window_start", "window_end", "CpG_density",
                             paste0(args[['sample']], "_beta"))
} else {
  output_beta <- makeTable(
    qseaset,
    norm_methods = c("beta"),
    samples = sample_names
  )
}

# One-row QC summary for this sample / chromosome.
QCstats <- data.frame(
  numReads_Unique_QSEA = qseaset@libraries$file_name[1, "valid_fragments"],
  QSEA_Percent_Fragments_due_Background = getOffset(qseaset) * 100,
  QSEA_Enrichment = qseaenrichment
)

## write out

# All following paths are resolved from inside the output directory.
setwd(args[['output']])

write_tsv(output_counts, file = args[['count']], col_names = TRUE)
write_tsv(output_beta, file = args[['beta']], col_names = TRUE)
write_tsv(QCstats, file = args[['qc']], col_names = TRUE) # save QC metrics

# The enrichment profile can only be drawn when the parameters were fitted.
if (qseaenrichment) {
  profile_png <- paste0("EnrichmentProfile", args[['chrom']], ".png")
  png(filename = profile_png, width = 480, height = 480, units = "px")
  plotEPmatrix(qseaset)
  dev.off()
}

# Drop every object created by the script and run the garbage collector.
rm(list = ls())
gc()

R tidyverse BSgenome.Hsapiens.UCSC.hg38 GenomicRanges docopt BSgenome IRanges qsea From line 1 of R/run_QSEA.R

Easy Copy Number Analysis (EaCoN) Pipeline

renv::activate()
library(EaCoN)


# 0.2 -- Parse Snakemake arguments
input <- snakemake@input
params <- snakemake@params
nthreads <- snakemake@threads
output <- snakemake@output


# 0.3 -- Load platform specific dependencies
# Every array_type check is case-insensitive so that e.g. "SNP6" and "snp6"
# load the same packages (the rest of the script matches the same way).
if (grepl("snp|cytoscan|oncoscan", params$array_type, ignore.case=TRUE)) {
    library(affy.CN.norm.data)
}

if (grepl('snp', params$array_type, ignore.case=TRUE)) {
    library(apt.snp6.1.20.0)
    library(rcnorm)
    library(GenomeWideSNP.6.na35.r1)
}
## FIXME:: Add conditional dependencies for other platforms

# Attach the reference genome package named in params$reference.
# A reference that matches neither name loads nothing.
switch(params$reference,
    'BSgenome.Hsapiens.UCSC.hg19'=library(BSgenome.Hsapiens.UCSC.hg19),
    'BSgenome.Hsapiens.UCSC.hg38'={
        if (grepl("snp", params$array_type, ignore.case=TRUE))
            stop("Must use BSgenome.Hsapiens.UCSC.hg19 for GenomeWide SNP6 arrays!")
        library(BSgenome.Hsapiens.UCSC.hg38)
    }
)


# 1 -- Load or create the metadata file specifying CEL paths
if (file.exists(input$pairs_file)) {
    pairs_df <- read.table(input$pairs_file, sep="\t", header=TRUE,
        stringsAsFactors=FALSE)
} else {
    # find all CEL files in the raw data; `pattern` is a regular expression,
    # so the dot is escaped to require a literal ".CEL" suffix
    cel_file_paths <- list.files(params$rawdata, pattern="\\.CEL$",
        recursive=TRUE, full.names=TRUE)
    pairs_df <- data.frame(
        cel_files=cel_file_paths,
        # assumes the second "/"-separated element in path is the sample name
        # NOTE(review): confirm this matches the layout under params$rawdata
        SampleName=vapply(cel_file_paths,
            FUN=function(x) strsplit(x, "/", fixed=TRUE)[[1]][2],
            FUN.VALUE=character(1), USE.NAMES=FALSE),
        stringsAsFactors=FALSE
    )
    # only write the file out when a usable path was actually supplied
    if (!is.null(input$pairs_file) && nzchar(input$pairs_file)) {
        # create path if it doesn't exist
        pairs_path <- dirname(input$pairs_file)
        if (!dir.exists(pairs_path)) dir.create(pairs_path, recursive=TRUE)
        # write out a pairs file in the same tab-separated, headered layout
        # that the read.table() call above expects on the next run
        write.table(pairs_df, input$pairs_file, sep="\t", row.names=FALSE)
    }
}


# 2 -- Format the paths in the pairs_file to match this project directory
#   structure from config.yaml
# NOTE(review): the original first pattern was 'cytocscan|oncoscan', which can
#   never match "cytoscan", so CytoScan runs skipped the two-channel branch.
#   That is kept deliberately: the two-channel (AT/GC) columns are treated as
#   OncoScan-only, and CytoScan is checked like SNP6 (one `cel_files` column),
#   matching the table built when no pairs file exists. Confirm against the
#   EaCoN pairs-file documentation.
if (grepl('oncoscan', params$array_type, ignore.case=TRUE)) {
    stopifnot(all(c("ATChannelCel", "GCChannelCel", "SampleName") %in% colnames(pairs_df)))
    # remove existing path, if there is one
    pairs_df$ATChannelCel <- gsub("^.*\\/", "", pairs_df$ATChannelCel)
    pairs_df$GCChannelCel <- gsub("^.*\\/", "", pairs_df$GCChannelCel)
    # create a new path relative to specified rawdata directory; both channels
    # are rebuilt so neither is left as a bare file name
    pairs_df$ATChannelCel <- file.path(getwd(), params$rawdata,
        pairs_df$ATChannelCel)
    pairs_df$GCChannelCel <- file.path(getwd(), params$rawdata,
        pairs_df$GCChannelCel)
} else if (grepl('snp6|cytoscan', params$array_type, ignore.case=TRUE)) {
    stopifnot(all(c("cel_files", "SampleName") %in% colnames(pairs_df)))
    # pairs_df$cel_files <- gsub("^.*\\/", "", pairs_df$cel_files)
    # pairs_df$cel_files <- file.path(getwd(), params$rawdata, pairs_df$cel_files)
}
# output the file to temporary storage so it can be read by EaCoN
pairs_file <- file.path(tempdir(), "CEL_pairs_file.csv")
write.table(pairs_df, file=pairs_file, sep="\t")


# 3 -- Preprocess and normalize the raw data
# Dispatch on the array family (case-insensitive). Every branch passes the same
# settings: thread count, output directory, force=TRUE and the cluster type.
if (grepl('cytoscan', params$array_type, ignore.case=TRUE)) {
    # argument name is `cluster.type`, as in the other branches (the original
    # misspelling `cluter.type` never reached that parameter)
    EaCoN:::CS.Process.Batch(pairs_file,
        nthread=nthreads, out.dir=params$procdata, force=TRUE,
        cluster.type=params$cluster_type)
} else if (grepl('oncoscan', params$array_type, ignore.case=TRUE)) {
    EaCoN:::OS.Process.Batch(pairs_file,
        nthread=nthreads, out.dir=params$procdata, force=TRUE,
        cluster.type=params$cluster_type)
} else if (grepl('snp', params$array_type, ignore.case=TRUE)) {
    EaCoN:::SNP6.Process.Batch(pairs_file, out.dir=params$procdata, force=TRUE,
        nthread=nthreads, cluster.type=params$cluster_type)
} else if (grepl('wes', params$array_type, ignore.case=TRUE)) {
    stop("WES has not been implemented in this pipeline yet, please see
        https://github.com/gustaveroussy/EaCoN for information on
        setting up your own analysis script.")
} else {
    stop("Supported assay families are wes, cytoscan, oncoscan and snp6")
}

R BSgenome.Hsapiens.UCSC.hg38 EaCoN From line 2 of scripts/1_batchProcessRawdataFiles.R

Reproducible reanalysis of a combined ChIP-Seq & RNA-Seq data set

suppressMessages({
    library(rtracklayer)
    library(assertthat)
    library(BSgenome.Hsapiens.UCSC.hg38)
})

{
    # Destination for the serialized GRanges, supplied by Snakemake.
    outfile <- snakemake@output[[1]]
    assert_that(is.character(outfile))

    hg38_seqinfo <- seqinfo(BSgenome.Hsapiens.UCSC.hg38)

    # Query the "cpgIslandExtUnmasked" table for hg38 through a UCSC session.
    ucsc_session <- browserSession()
    genome(ucsc_session) <- "hg38"
    cpg_query <- ucscTableQuery(ucsc_session, "cpgIslandExtUnmasked")
    cpg_table <- getTable(cpg_query)

    # Table starts are 0-based; every non-coordinate column is kept as metadata.
    gr <- makeGRangesFromDataFrame(
        cpg_table,
        keep.extra.columns = TRUE,
        seqinfo = hg38_seqinfo,
        start.field = "chromStart",
        end.field = "chromEnd",
        starts.in.df.are.0based = TRUE
    )

    ## The table's `length` column duplicates the range widths: verify that,
    ## then drop it.
    assert_that(all(width(gr) == gr$length))
    mcols(gr)$length <- NULL
    seqinfo(gr) <- hg38_seqinfo

    saveRDS(gr, outfile)
}

R BSgenome.Hsapiens.UCSC.hg38 assertthat From line 1 of scripts/get-CpG.R

Run MeDEStrand with bedpe input as part of PLBR database workflow (v0.2.0)

library(docopt)
## adapted from https://github.com/oicr-gsi/wf_cfmedip/blob/master/workflow/runMedips/runMedips.r

# Command-line interface for QC_MEDIPS.R, parsed by docopt.
doc <- "Get MEDIPS QC metrics.
Usage:
    QC_MEDIPS.R --bamFile <FILE> --outputDir <DIR> --windowSize <SIZE> [ --genome <GENOME> ]

Options:
    --bamFile FILE       Aligned, sorted, filtered reads (bam)
    --outputDir DIR      Path to output folder
    --windowSize SIZE    Size of genomic windows (bp, e.g. 300)
    --genome GENOME      Path to a folder containing a custom BSgenome as a package, 
                             which will be loaded using devtools::load_all();
                             or the name of BSgenome (usually BSgenome.Hsapiens.UCSC.hg38
                             or BSgenome.Athaliana.TAIR.TAIR9)
    --help               show this help text
"
opt <- docopt(doc)

library(tidyverse)
library(gtools)
library(MEDIPS)
library(BSgenome.Hsapiens.UCSC.hg38)

# Custom-genome loading is currently disabled; the genome is fixed below.
#if (file.exists(paste(opt[['genome']], 'DESCRIPTION', sep='/'))) {
#    devtools::load_all(opt[['genome']])
#    bsgenome <- getBSgenome(basename(opt[['genome']]))
#} else {
#    bsgenome <- getBSgenome(opt[['genome']])
#}

if (!file.exists(opt$bamFile)){
  stop(paste0("ERROR: bam file not found ", opt$bamFile), call.=FALSE)
}

## get user parameters
bam_file = opt$bamFile
ws = as.numeric(opt$windowSize)
# Fail early with a clear message instead of passing NA / non-positive sizes on.
if (is.na(ws) || ws <= 0) {
  stop(paste0("ERROR: --windowSize must be a positive number, got ", opt$windowSize), call.=FALSE)
}
out_dir = paste0(opt$outputDir, "/")
chr.select=paste0("chr", c(1:22,"X","Y"))
#chr.select=c("chr1","chr22")

BSgenome="BSgenome.Hsapiens.UCSC.hg38"
# --genome is accepted on the command line but not honoured (see the disabled
# block above); say so rather than silently analysing a different genome.
if (!is.null(opt$genome) && !identical(opt$genome, BSgenome)) {
  warning("--genome '", opt$genome, "' is ignored; using ", BSgenome, call.=FALSE)
}
uniq = 0 ## WARNING: default settings normalize the data, must be set to 0 to disable this transformation
extend = 0 ## relevant for single-end: https://support.bioconductor.org/p/81098/
shift = 0
paired = TRUE

# disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

## create MEDIPS set ###################################
message("Processing MEDIPS QC metrics: window size: ", ws)
MeDIPset <- MEDIPS.createSet(
  file = bam_file,
  BSgenome = BSgenome,
  window_size = ws,
  chr.select = chr.select,
  paired = paired,
  uniq = uniq,
  extend = extend,
  shift = shift
)

# Sample label used in output file names: the BAM base name up to its first dot.
fname <- strsplit(basename(bam_file), split = "\\.")[[1]][1]

## coupling set: maps CG densities across the genome
CS <- MEDIPS.couplingVector(pattern = "CG", refObj = MeDIPset)

## saturation analysis #################################
## Asks whether the mapped reads are enough to give a saturated, reproducible
## coverage profile: Pearson correlation of the coverage profiles of two
## exclusive read subsets (A and B) drawn from the sample.
sr <- MEDIPS.saturation(
  file = bam_file,
  BSgenome = BSgenome,
  window_size = ws,
  chr.select = chr.select,
  paired = paired,
  uniq = uniq,
  extend = extend,
  shift = shift,
  nit = 10,
  nrit = 1,
  empty_bins = TRUE,
  rank = FALSE
)
estimated_cor <- round(sr$maxEstCor[2], 5)
true_cor <- round(sr$maxTruCor[2], 5)
print(paste0("Estimated correlation is: ", estimated_cor))
print(paste0("True correlation is: ", true_cor))

saturation_pdf <- paste0(out_dir, fname, ".MEDIPS.SaturationPlot.pdf")
pdf(saturation_pdf, width = 5, height = 4)
MEDIPS.plotSaturation(sr)
dev.off()

## sequence coverage analysis ##########################
## reports how many CpGs are / are not covered by reads in the sample

cr <- MEDIPS.seqCoverage(
  file = bam_file,
  pattern = "CG",
  BSgenome = BSgenome,
  chr.select = chr.select,
  paired = paired,
  uniq = uniq,
  extend = extend,
  shift = shift
)
print(paste0("Total number of reads: ", cr$numberReads))
print(paste0("Number of reads NOT covering a CpG: ", cr$numberReadsWO))
print(paste0("Fraction of reads NOT covering a CpG: ", round(cr$numberReadsWO / cr$numberReads, 5)))

# Tally CpGs by coverage level once, then report counts followed by fractions.
seqcov_n_cpg <- length(cr$cov.res)
seqcov_labels <- c("not covered by a read",
                   "covered by 1 read",
                   "covered by 2 reads",
                   "covered by 3 reads",
                   "covered by 4 reads",
                   "covered by 5 reads",
                   "covered by >5 reads")
seqcov_counts <- c(length(cr$cov.res[cr$cov.res < 1]),
                   length(cr$cov.res[cr$cov.res == 1]),
                   length(cr$cov.res[cr$cov.res == 2]),
                   length(cr$cov.res[cr$cov.res == 3]),
                   length(cr$cov.res[cr$cov.res == 4]),
                   length(cr$cov.res[cr$cov.res == 5]),
                   length(cr$cov.res[cr$cov.res > 5]))

print(paste0("Number of CpGs in reference: ", seqcov_n_cpg))
for (i in seq_along(seqcov_labels)) {
  print(paste0("Number of CpG ", seqcov_labels[i], ": ", seqcov_counts[i]))
}

for (i in seq_along(seqcov_labels)) {
  print(paste0("Fraction of CpG ", seqcov_labels[i], ": ",
               round(seqcov_counts[i] / seqcov_n_cpg, 5)))
}


# Pie chart of the coverage levels 0..5 (and above).
pdf(paste0(out_dir, fname, ".MEDIPS.seqCovPie.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(
  seqCoverageObj = cr,
  type = "pie",
  cov.level = c(0, 1, 2, 3, 4, 5),
  main = "Sequence pattern coverage, pie chart"
)
dev.off()

# Histogram of the coverage (t = 15).
pdf(paste0(out_dir, fname, ".MEDIPS.seqCovHist.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(
  seqCoverageObj = cr,
  type = "hist",
  t = 15,
  main = "Sequence pattern coverage, histogram"
)
dev.off()

## CpG enrichment #####################################
## test CpG enrichment of given set of short reads covering a set of genomic regions vs reference genome
## regions.relH - relative freq of CpGs within a sample's immunoprecipitated regions
## genome.relH - relative freq of CpGs within reference genome
## enrichment.score.relH - regions.relH/genome.relH
## regions.GoGe - obs/exp ratio of CpGs within a sample's immunoprecipitated regions
## genome.GoGe - obs/exp ratio of CpGs within genomic regions
## enrichment.score.GoGe - regions.GoGe/genome.GoGe
## (relH and GoGe = 2 different ways of calculating enrichment)


## original MEDIPS.CpGenrich has IRanges issue
## this is adapted from script by Nick Cheng
#' CpG enrichment of the sequenced regions relative to the reference genome.
#'
#' Variant of MEDIPS.CpGenrich that counts "CG", "C" and "G" directly on the
#' sequences of the read regions (via getSeq/vcountPattern) and compares the
#' resulting relH and GoGe values with those of the whole reference genome.
#'
#' @param file Path to the alignment file. It is split on "/" into a directory
#'   and a file name; a path without a directory part is looked up in getwd().
#' @param BSgenome Name (character) of a BSgenome package that is already
#'   attached; the genome object is taken from that package's environment.
#' @param extend,shift,uniq,chr.select Passed unchanged to the read loader.
#' @param paired If TRUE reads are loaded with getPairedGRange(), otherwise
#'   with getGRange(). NOTE(review): both helpers are assumed to be visible
#'   from the attached MEDIPS package; verify.
#' @return A list with the genome name, the region and genome C / G / CG
#'   counts, the region and genome relH and GoGe values, and the two
#'   enrichment scores (region value divided by genome value).
MEDIPS.CpGenrichNew <-
  function(file=NULL, BSgenome=NULL, extend=0, shift=0, uniq=1e-3, chr.select=NULL, paired=FALSE){

    ## Check the mandatory arguments
    if(is.null(BSgenome)){stop("Must specify a BSgenome library.")}
    if(is.null(file)){stop("Must specify an input file.")}

    ## Split the input path into directory and file name
    path_parts <- unlist(strsplit(file, "/"))
    fileName <- path_parts[length(path_parts)]
    ## everything except the last element (the original relied on `1:(n)-1`
    ## evaluating to 0:(n-1) and on index 0 being dropped)
    path <- paste(utils::head(path_parts, -1), collapse="/")
    if(path==""){path <- getwd()}
    if(!fileName%in%dir(path)){stop(paste("File", fileName, " not found in", path, sep =" "))}

    ## First object exported by the attached BSgenome package
    dataset <- get(ls(paste("package:", BSgenome, sep = ""))[1])

    if(paired){
      GRange.Reads <- getPairedGRange(fileName, path, extend, shift, chr.select, dataset, uniq)
    } else {
      GRange.Reads <- getGRange(fileName, path, extend, shift, chr.select, dataset, uniq)
    }

    ## Sort chromosomes (defined for any number of seqlevels, including zero)
    chromosomes <- unique(seqlevels(GRange.Reads))
    if(length(chromosomes)>1){chromosomes <- mixedsort(chromosomes)}

    ## Get chromosome lengths for all chromosomes within data set.
    ## Kept together with its progress message; the value is not used below.
    cat(paste("Loading chromosome lengths for ",BSgenome, "...\n", sep=""))
    chr_lengths <- as.numeric(seqlengths(dataset)[chromosomes])

    ## Clip read ranges so that no range starts before position 1
    ranges(GRange.Reads) <- restrict(ranges(GRange.Reads),+1)

    ## Calculate CpG density for regions
    cat("Calculating CpG density for given regions...\n")

    ## Sequence of every read region, then pattern counts over all of them
    readsChars <- unlist(getSeq(dataset, GRange.Reads, as.character=TRUE))

    regions.CG <- sum(vcountPattern("CG",readsChars))
    regions.C  <- sum(vcountPattern("C",readsChars))
    regions.G  <- sum(vcountPattern("G",readsChars))
    all.genomic <- sum(width(readsChars))

    ## relH: CG count per 100 bases; GoGe: observed/expected CG ratio
    regions.relH <- as.numeric(regions.CG)/as.numeric(all.genomic)*100
    regions.GoGe <- (as.numeric(regions.CG)*as.numeric(all.genomic))/(as.numeric(regions.C)*as.numeric(regions.G))

    ## Calculate CpG density for reference genome
    cat(paste("Calculating CpG density for the reference genome",
              BSgenome, "...\n", sep = " "))

    CG <- DNAStringSet("CG")
    pdict0 <- PDict(CG)
    ## Sequences whose names match "rand" or "chrUn" are excluded
    params <- new("BSParams", X = dataset, FUN = countPDict, simplify = TRUE, exclude = c("rand", "chrUn"))
    genome.CG <- sum(bsapply(params, pdict = pdict0))
    params <- new("BSParams", X = dataset, FUN = alphabetFrequency, exclude = c("rand", "chrUn"), simplify=TRUE)
    alphabet <- bsapply(params)
    genome.l <- sum(as.numeric(alphabet))
    ## rows 2 and 3 of the alphabet matrix are read as the C and G counts
    genome.C <- as.numeric(sum(alphabet[2,]))
    genome.G <- as.numeric(sum(alphabet[3,]))
    genome.relH <- genome.CG/genome.l*100
    genome.GoGe <- (genome.CG*genome.l)/(genome.C*genome.G)

    ## Enrichment = region value relative to the genome-wide value
    enrichment.score.relH <- regions.relH/genome.relH
    enrichment.score.GoGe <- regions.GoGe/genome.GoGe

    gc()
    list(genome=BSgenome,
         regions.CG=regions.CG,
         regions.C=regions.C,
         regions.G=regions.G,
         regions.relH=regions.relH,
         regions.GoGe=regions.GoGe,
         genome.C=genome.C,
         genome.G=genome.G,
         genome.CG=genome.CG,
         genome.relH=genome.relH,
         genome.GoGe=genome.GoGe,
         enrichment.score.relH=enrichment.score.relH,
         enrichment.score.GoGe=enrichment.score.GoGe)
  }

er <- MEDIPS.CpGenrichNew(
  file = bam_file,
  BSgenome = BSgenome,
  chr.select = chr.select,
  paired = paired,
  uniq = uniq,
  extend = extend,
  shift = shift
)

## The two saturation correlations (satr.est_cor, satr.tru_cor) involve
## randomness and differ between repeat runs; every other metric should be
## identical on repeat runs.

message("Writing out MEDIPS QC metrics: saturation, CpG coverage and CpG enrichment.")

# CpG tallies by coverage level: 0, 1, 2, 3, 4, 5 and more than 5 reads.
qc_n_cpg <- length(cr$cov.res)
qc_cpg_counts <- c(length(cr$cov.res[cr$cov.res < 1]),
                   length(cr$cov.res[cr$cov.res == 1]),
                   length(cr$cov.res[cr$cov.res == 2]),
                   length(cr$cov.res[cr$cov.res == 3]),
                   length(cr$cov.res[cr$cov.res == 4]),
                   length(cr$cov.res[cr$cov.res == 5]),
                   length(cr$cov.res[cr$cov.res > 5]))
qc_cpg_fracs <- round(qc_cpg_counts / qc_n_cpg, 5)

# Metric names, in the same order as the values assembled below.
qc_metric_names <- c(
  "ref_genome",
  "satr.est_cor",
  "satr.tru_cor",
  "CpG_cov.totalNumReads",
  "CpG_cov.numReadsWoCpG",
  "CpG_cov.fracReadsWoCpG",
  "CpG_cov.numCpGinRef",
  "CpG_cov.numCpGwoReads",
  "CpG_cov.numCpGw1read",
  "CpG_cov.numCpGw2Reads",
  "CpG_cov.numCpGw3Reads",
  "CpG_cov.numCpGw4Reads",
  "CpG_cov.numCpGw5Reads",
  "CpG_cov.numCpGgt5Reads",
  "CpG_cov.fracCpGwoReads",
  "CpG_cov.fracCpGw1read",
  "CpG_cov.fracCpGw2Reads",
  "CpG_cov.fracCpGw3Reads",
  "CpG_cov.fracCpGw4Reads",
  "CpG_cov.fracCpGw5Reads",
  "CpG_cov.fracCpGgt5Reads",
  "enrich.regions.C",
  "enrich.regions.G",
  "enrich.regions.CG",
  "enrich.genome.C",
  "enrich.genome.G",
  "enrich.genome.CG",
  "enrich.regions.relH",
  "enrich.genome.relH",
  "enrich.regions.GoGe",
  "enrich.genome.GoGe",
  "enrich.enrichment.score.relH",
  "enrich.enrichment.score.GoGe"
)

# The genome name is a string, so c() turns the whole vector into character.
qc_metric_values <- c(
  er$genome,
  round(sr$maxEstCor[2], 5),
  round(sr$maxTruCor[2], 5),
  cr$numberReads,
  cr$numberReadsWO,
  round(cr$numberReadsWO / cr$numberReads, 5),
  qc_n_cpg,
  qc_cpg_counts,
  qc_cpg_fracs,
  er$regions.C,
  er$regions.G,
  er$regions.CG,
  er$genome.C,
  er$genome.G,
  er$genome.CG,
  round(er$regions.relH, 5),
  round(er$genome.relH, 5),
  round(er$regions.GoGe, 5),
  round(er$genome.GoGe, 5),
  round(er$enrichment.score.relH, 5),
  round(er$enrichment.score.GoGe, 5)
)

QC_MEDIPS.df <- data.frame(
  QC_type = rep("medips_QC", length(qc_metric_names)),
  metrics = qc_metric_names,
  values = qc_metric_values
)
# The value column is named after the sample.
names(QC_MEDIPS.df) <- c("QC_type", "metrics", fname)

# Tab-separated despite the .csv suffix in the file name.
write.table(
  QC_MEDIPS.df,
  paste0(out_dir, fname, "_QC_MEDIPS.csv"),
  sep = '\t',
  quote = FALSE,
  row.names = FALSE
)

R tidyverse BSgenome.Hsapiens.UCSC.hg38 docopt GTools MEDIPS From line 1 of QC/QC_MEDIPS.R

library(docopt)

# Command-line interface for MeDEStrandBEDPE.r, parsed by docopt.
doc <- "Usage:
MeDEStrandBEDPE.r --inputFile <FILE> --outputFile <FILE> --windowSize <SIZE> --chr_select <CHRS>

--inputFile FILE     Aligned, sorted, filtered bam, or bedpelean
--outputFile FILE    Bedgraph methylation profile
--windowSize SIZE    Size of genomic windows for methylation profiling
--chr_select CHRS    Chromosomes to analyze
--help               show this help text"
opt <- docopt(doc)

# Input / output existence checks are currently disabled.
#if (!file.exists(opt$inputFile)){
#  stop(paste0("bam or bedpe file not found ",opt$inputFile), call.=FALSE)
#}
#if (!file.exists(opt$outputDir)){
#  dir.create(opt$outputDir)
#}

library(MeDEStrandBEDPE)
library("BSgenome.Hsapiens.UCSC.hg38")
library(GenomicRanges)

# Raw trailing command-line arguments, kept alongside the docopt result.
args <- commandArgs(TRUE)

# Retrieve user parameters
ws <- as.numeric(opt$windowSize)
sample <- opt$inputFile
output <- opt$outputFile
# Paired-end mode is fixed for this workflow.
paired <- TRUE

#  Adapted from: https://github.com/jxu1234/MeDEStrand/blob/master/R/MeDEStrand.createSet.R
#  The original function  uses hardcoded hg19; here, we switch to hg38
#  NOTE(review): the link above names createSet.R, while this function is a
#  bin-methylation routine -- confirm which upstream file it was derived from.
#
#  Infer per-bin methylation from two strand-specific sets.
#
#  For each of the first two elements of `MSetInput` (reported in the progress
#  messages as the positive and the negative strand) the function:
#    1. takes the per-bin counts (`genome_count`) and the per-bin coupling
#       factors of `CSet` (`genome_CF`);
#    2. builds a calibration curve and fits a logistic model of the scaled mean
#       signal against the coupling level, using the curve up to its maximum;
#    3. keeps the observed mean signal up to that maximum and uses the model
#       prediction above it as the expected signal per coupling level;
#    4. divides the counts by the expected signal, log2-transforms, shifts by
#       the minimum finite value, caps at the 99.95% quantile, rescales by that
#       cap and rounds to two digits.
#  The two strand-level vectors are then averaged.
#
#  Arguments:
#    MSetInput  list whose first two elements are the strand-specific sets.
#    CSet       coupling set providing `genome_CF`.
#    ccObj      ignored on input; it is recomputed for every strand (kept only
#               so the signature stays unchanged).
#    Granges    FALSE: return the merged numeric vector.
#               TRUE: return the object built by MEDIPS.GenomicCoordinates for
#               the hg38 bins, with columns `CF` and `binMethyl` attached.
MeDEStrand.binMethyl_hg38 <- function(MSetInput=NULL, CSet=NULL, ccObj=NULL, Granges = FALSE){
  # Fail early with a clear message instead of hitting an undefined `MSet`
  # (or silently picking up a global one) further down.
  if (!is.list(MSetInput) || length(MSetInput) < 2) {
    stop("MSetInput must be a list with one set per strand (positive, negative).",
         call. = FALSE)
  }

  strand_names <- c("positive", "negative")
  strand_signal <- vector("list", 2)

  for (i in 1:2) {
    MSet <- MSetInput[[i]]
    signal <- genome_count(MSet)
    coupling <- genome_CF(CSet)
    ccObj <- MeDEStrand.calibrationCurve(MSet=MSet, CSet=CSet, input=FALSE)

    # Index of the calibration-curve maximum (searched within 1..max_index)
    index.max <- which(ccObj$mean_signal == max(ccObj$mean_signal[1:ccObj$max_index]))
    MS <- ccObj$mean_signal[1:index.max]
    CF <- ccObj$coupling_level[1:index.max]
    model.data <- data.frame(model.MS = MS / max(MS), model.CF = CF)
    logistic.fit <- glm(model.MS ~ model.CF, family=binomial(logit), data = model.data)

    cat("Estimating and correcting CG bias for reads mapped to the DNA ",
        strand_names[i], " strand...\n", sep = "")

    # Expected signal per coupling level: observed below the maximum,
    # model-predicted (rescaled to the signal at max_index) above it.
    estim <- numeric(length(ccObj$mean_signal))  # all 0's
    low_range <- 1:index.max
    estim[low_range] <- ccObj$mean_signal[low_range]
    high_range <- (length(low_range) + 1):length(estim)
    y.predict <- predict(logistic.fit,
                         data.frame(model.CF = ccObj$coupling_level[high_range]),
                         type ="response") * ccObj$mean_signal[ccObj$max_index]
    estim[high_range] <- y.predict

    # Normalise counts by the expected signal; bins without coupling get 0
    signal <- signal / estim[coupling + 1]
    signal[coupling == 0] <- 0
    signal <- log2(signal)
    signal[is.na(signal)] <- 0

    # Shift finite values so the minimum is 0, cap at the 99.95% quantile and
    # rescale by that cap; infinite entries end up as 0.
    minsignal <- min(signal[signal != -Inf])
    signal[signal != -Inf] <- signal[signal != -Inf] + abs(minsignal)
    maxsignal <- quantile(signal[signal != Inf], 0.9995)
    signal[signal != Inf & signal > maxsignal] <- maxsignal
    signal <- round((signal / maxsignal), digits=2)
    signal[signal == -Inf | signal == Inf] <- 0

    strand_signal[[i]] <- signal
  }

  merged.signal <- (strand_signal[[1]] + strand_signal[[2]]) / 2
  if (!Granges) {
    return(merged.signal)
  }

  # Genomic coordinates of the bins, taken from the hg38 BSgenome
  chr.select <- MSet@chr_names
  window_size <- window_size(MSet)
  chr_lengths <- unname(seqlengths(BSgenome.Hsapiens.UCSC.hg38)[seqnames(BSgenome.Hsapiens.UCSC.hg38@seqinfo) %in% chr.select])
  no_chr_windows <- ceiling(chr_lengths / window_size)
  supersize_chr <- cumsum(no_chr_windows)
  chromosomes <- chr.select
  all.Granges.genomeVec <- MEDIPS.GenomicCoordinates(supersize_chr, no_chr_windows, chromosomes, chr_lengths, window_size)
  # Use the coupling set passed in as `CSet`, not a global object named `CS`
  all.Granges.genomeVec$CF <- genome_CF(CSet)
  all.Granges.genomeVec$binMethyl <- merged.signal
  return(all.Granges.genomeVec)
}

# Disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

# Set global variables for importing short reads. For details, in R console, type "?MeDEStrand.createSet"
BSgenome <- "BSgenome.Hsapiens.UCSC.hg38"
uniq <- 1
extend <- 200
shift <- 0
## { change this later to be dynamic }
# --chr_select arrives as one space-separated string, e.g. "chr1 chr2 chr3"
chr.select <- strsplit(opt$chr_select, " ")[[1]]
print(chr.select)
#chr.select = paste0("chr", c(1:22,"X","Y"))

#fname <- unlist(strsplit(basename(opt$inputFile),split="\\."))[1]
#df_for_wig <- NULL
#bed_wig_output <- paste0(opt$outputDir,"/MeDEStrand_hg38_",fname,"_ws",ws,"_wig.bed")

# Stays NULL when the analysis fails, in which case an empty output file is
# written below (best-effort behaviour so that downstream steps still find it).
output_df <- NULL

tryCatch({

  # Create a MeDIP set
  MeDIP_seq <- MeDEStrand.createSet(file=opt$inputFile, BSgenome=BSgenome, extend=extend, shift=shift, uniq=uniq, window_size=ws, chr.select=chr.select, paired=paired)

  #  Count CpG pattern in the bins
  CS <- MeDEStrand.countCG(pattern="CG", refObj=MeDIP_seq)

  # Infer genome-wide absolute methylation levels:
  #result.methylation = MeDEStrand.binMethyl(MSetInput = MeDIP_seq, CSet = CS, Granges = TRUE)
  result.methylation <- MeDEStrand.binMethyl_hg38(MSetInput = MeDIP_seq, CSet = CS, Granges = TRUE)

  # Create a dataframe from the previous GRanges object.
  # Warning: GRanges and UCSC BED files use different conventions for the genomic coordinates
  # GRanges use 1-based intervals (chr1:2-8 means the 2nd till and including the 8th base of chr1, i.e. a range of length of 7 bases)
  # UCSC bed-files use 0-based coordinates (chr1:2-8 means the 3rd base till and including the 8th base, i.e. a range of length of 6 bases)

  # Dataframe for generating a bed file used to generate then a wig file
  # (start is shifted by -1 to move from 1-based to 0-based coordinates)
  output_df <- data.frame(seqnames=seqnames(result.methylation),
                          starts=start(result.methylation)-1,
                          ends=end(result.methylation),
                          scores=elementMetadata(result.methylation)$binMethyl)

}, error = function(e){
  message("Error: MeDEStrand CpG density normalization failed due to small number of reads")
  # Any failure inside the block ends up here, so also report what actually
  # went wrong instead of only the assumed cause.
  message("Underlying error: ", conditionMessage(e))
})

write.table(output_df, file = output, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE)

R BSgenome.Hsapiens.UCSC.hg38 GenomicRanges docopt From line 3 of R/MeDEStrandBEDPE.R

Run MedRemix with bedpe input as part of PLBR database workflow (v0.2.0)

library(docopt)
## adapted from https://github.com/oicr-gsi/wf_cfmedip/blob/master/workflow/runMedips/runMedips.r

doc <- "Get MEDIPS QC metrics.
Usage:
    QC_MEDIPS.R --bamFile <FILE> --outputDir <DIR> --windowSize <SIZE> [ --genome <GENOME> ]

Options:
    --bamFile FILE       Aligned, sorted, filtered reads (bam)
    --outputDir DIR      Path to output folder
    --windowSize SIZE    Size of genomic windows (bp, e.g. 300)
    --genome GENOME      Path to a folder containing a custom BSgenome as a package, 
                             which will be loaded using devtools::load_all();
                             or the name of BSgenome (usually BSgenome.Hsapiens.UCSC.hg38
                             or BSgenome.Athaliana.TAIR.TAIR9)
    --help               show this help text
"
opt <- docopt(doc)

library(tidyverse)
library(gtools)
library(MEDIPS)
library(BSgenome.Hsapiens.UCSC.hg38)

## NOTE: --genome is accepted on the command line but currently ignored; the
## loader below is commented out and hg38 is hard-coded further down.
#if (file.exists(paste(opt[['genome']], 'DESCRIPTION', sep='/'))) {
#    devtools::load_all(opt[['genome']])
#    bsgenome <- getBSgenome(basename(opt[['genome']]))
#} else {
#    bsgenome <- getBSgenome(opt[['genome']])
#}

if (!file.exists(opt$bamFile)){
  stop(paste0("ERROR: bam file not found ", opt$bamFile), call.=FALSE)
}

## get user parameters
bam_file <- opt$bamFile
ws <- suppressWarnings(as.numeric(opt$windowSize))
## reject a non-numeric or non-positive window size here, with a clear message
if (is.na(ws) || ws <= 0) {
  stop(paste0("ERROR: --windowSize must be a positive number, got ", opt$windowSize), call.=FALSE)
}
## make sure the output folder exists before any pdf()/write.table() targets it
if (!dir.exists(opt$outputDir)) {
  dir.create(opt$outputDir, recursive = TRUE)
}
## trailing slash so that file names can be appended with paste0()
out_dir <- paste0(opt$outputDir, "/")
chr.select <- paste0("chr", c(1:22,"X","Y"))
#chr.select=c("chr1","chr22")

BSgenome <- "BSgenome.Hsapiens.UCSC.hg38"
uniq <- 0 ## WARNING: default settings normalize the data, must be set to 0 to disable this transformation
extend <- 0 ## relevant for single-end: https://support.bioconductor.org/p/81098/
shift <- 0
paired <- TRUE

# disables the scientific notation to avoid powers in genomic coordinates (i.e. 1e+10)
options(scipen = 999)

## create MEDIPS set ###################################
message("Processing MEDIPS QC metrics: window size: ", ws)
MeDIPset <- MEDIPS.createSet(
  file = bam_file,
  BSgenome = BSgenome,
  uniq = uniq,
  extend = extend,
  shift = shift,
  paired = paired,
  window_size = ws,
  chr.select = chr.select
)

## sample label: BAM file name up to its first dot
fname <- strsplit(basename(bam_file), split = "\\.")[[1]][1]

## coupling set: maps CG densities across the genome
CS <- MEDIPS.couplingVector(pattern = "CG", refObj = MeDIPset)

## saturation analysis #################################
## asks whether the mapped reads suffice for a saturated, reproducible coverage profile;
## based on the Pearson correlation of coverage profiles between two exclusive read sets (A, B)

sr <- MEDIPS.saturation(
  file = bam_file,
  BSgenome = BSgenome,
  uniq = uniq,
  extend = extend,
  shift = shift,
  paired = paired,
  window_size = ws,
  chr.select = chr.select,
  nit = 10,
  nrit = 1,
  empty_bins = TRUE,
  rank = FALSE
)
print(paste0("Estimated correlation is: ", round(sr$maxEstCor[2], 5)))
print(paste0("True correlation is: ", round(sr$maxTruCor[2], 5)))

## saturation plot, written next to the other QC outputs
pdf(paste0(out_dir, fname, ".MEDIPS.SaturationPlot.pdf"), width = 5, height = 4)
MEDIPS.plotSaturation(sr)
dev.off()

## sequence coverage analysis ##########################
## reports how many CpGs are covered / not covered in the sample

cr <- MEDIPS.seqCoverage(
  file = bam_file,
  pattern = "CG",
  BSgenome = BSgenome,
  uniq = uniq,
  extend = extend,
  shift = shift,
  paired = paired,
  chr.select = chr.select
)
print(paste0("Total number of reads: ", cr$numberReads))
print(paste0("Number of reads NOT covering a CpG: ", cr$numberReadsWO))
print(paste0("Fraction of reads NOT covering a CpG: ", round(cr$numberReadsWO / cr$numberReads, 5)))

n_cpg_ref <- length(cr$cov.res)
print(paste0("Number of CpGs in reference: ", n_cpg_ref))

## coverage bins: 0 reads, exactly 1..5 reads, more than 5 reads
cov_bin_labels <- c("not covered by a read",
                    "covered by 1 read",
                    "covered by 2 reads",
                    "covered by 3 reads",
                    "covered by 4 reads",
                    "covered by 5 reads",
                    "covered by >5 reads")
cov_bin_counts <- integer(length(cov_bin_labels))
cov_bin_counts[1] <- length(cr$cov.res[cr$cov.res < 1])
for (depth in 1:5) {
  cov_bin_counts[depth + 1] <- length(cr$cov.res[cr$cov.res == depth])
}
cov_bin_counts[7] <- length(cr$cov.res[cr$cov.res > 5])

## absolute counts first, then the same bins as fractions of all reference CpGs
for (bin in seq_along(cov_bin_labels)) {
  print(paste0("Number of CpG ", cov_bin_labels[bin], ": ", cov_bin_counts[bin]))
}

for (bin in seq_along(cov_bin_labels)) {
  print(paste0("Fraction of CpG ", cov_bin_labels[bin], ": ", round(cov_bin_counts[bin] / n_cpg_ref, 5)))
}


pdf(paste0(out_dir, fname, ".MEDIPS.seqCovPie.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(
  seqCoverageObj = cr,
  type = "pie",
  cov.level = c(0, 1, 2, 3, 4, 5),
  main = "Sequence pattern coverage, pie chart"
)
dev.off()

pdf(paste0(out_dir, fname, ".MEDIPS.seqCovHist.pdf"), width = 5, height = 4)
MEDIPS.plotSeqCoverage(
  seqCoverageObj = cr,
  type = "hist",
  t = 15,
  main = "Sequence pattern coverage, histogram"
)
dev.off()

## CpG enrichment #####################################
## test CpG enrichment of given set of short reads covering a set of genomic regions vs reference genome
## regions.relH - relative freq of CpGs within a sample's immunoprecipitated regions
## genome.relH - relative freq of CpGs within reference genome
## enrichment.score.relH - regions.relH/genome.relH
## regions.GoGe - obs/exp ratio of CpGs within a sample's immunoprecipitated regions
## genome.GoGe - obs/exp ratio of CpGs within genomic regions
## enrichment.score.GoGe - regions.GoGe/genome.GoGe
## (relH and GoGe = 2 different ways of calculating enrichment)


## original MEDIPS.CpGenrich has IRanges issue
## this is adapted from script by Nick Cheng
##
## Compute CpG enrichment of the reads in `file` relative to the reference genome.
##
## Arguments:
##   file        path to the aligned reads file (required)
##   BSgenome    name of an attached BSgenome package (required)
##   extend, shift, uniq, chr.select
##               passed through unchanged to getGRange / getPairedGRange
##   paired      TRUE reads the file with getPairedGRange, FALSE with getGRange
##
## Returns a list with the genome name, the C / G / CG counts within the read
## regions and within the genome, the relH and GoGe values for both, and the
## two enrichment scores (regions value divided by genome value).
MEDIPS.CpGenrichNew <-
  function(file=NULL, BSgenome=NULL, extend=0, shift=0, uniq=1e-3, chr.select=NULL, paired=FALSE){

    ## Proof correctness....
    if (is.null(BSgenome)) {stop("Must specify a BSgenome library.")}
    if (is.null(file)) {stop("Must specify an input file.")}

    ## Read region file
    ## basename()/dirname() replace the manual split on "/", which relied on
    ## `1:(n)-1` being parsed as `(1:n)-1` and resolved a file directly under
    ## "/" against the working directory.
    fileName <- basename(file)
    path <- dirname(file)
    if (path == ".") {path <- getwd()}
    if (!fileName %in% dir(path)) {stop(paste("File", fileName, " not found in", path, sep =" "))}

    ## first object exported by the attached BSgenome package
    dataset <- get(ls(paste0("package:", BSgenome))[1])

    if (!paired) {
      GRange.Reads <- getGRange(fileName, path, extend, shift, chr.select, dataset, uniq)
    } else {
      GRange.Reads <- getPairedGRange(fileName, path, extend, shift, chr.select, dataset, uniq)
    }

    ## clip ranges so that no read starts before position 1
    ranges(GRange.Reads) <- restrict(ranges(GRange.Reads), start = 1)

    ##Calculate CpG density for regions
    cat("Calculating CpG density for given regions...\n")

    ## new code ##################################
    readsChars <- unlist(getSeq(dataset, GRange.Reads, as.character=TRUE))

    regions.CG <- sum(vcountPattern("CG", readsChars))
    regions.C  <- sum(vcountPattern("C", readsChars))
    regions.G  <- sum(vcountPattern("G", readsChars))
    all.genomic <- sum(width(readsChars))
    ###############################################

    ## relH: CpGs per 100 bases; GoGe: observed/expected CpG ratio
    regions.relH <- as.numeric(regions.CG) / as.numeric(all.genomic) * 100
    regions.GoGe <- (as.numeric(regions.CG) * as.numeric(all.genomic)) / (as.numeric(regions.C) * as.numeric(regions.G))

    cat(paste("Calculating CpG density for the reference genome",
              BSgenome, "...\n", sep = " "))

    ##Calculate CpG density for reference genome
    CG <- DNAStringSet("CG")
    pdict0 <- PDict(CG)
    params <- new("BSParams", X = dataset, FUN = countPDict, simplify = TRUE, exclude = c("rand", "chrUn"))
    genome.CG <- sum(bsapply(params, pdict = pdict0))
    params <- new("BSParams", X = dataset, FUN = alphabetFrequency, exclude = c("rand", "chrUn"), simplify=TRUE)
    alphabet <- bsapply(params)
    genome.l <- sum(as.numeric(alphabet))
    ## rows 2 and 3 of the alphabet table are used as the C and G counts
    genome.C <- as.numeric(sum(alphabet[2,]))
    genome.G <- as.numeric(sum(alphabet[3,]))
    genome.relH <- genome.CG / genome.l * 100
    genome.GoGe <- (genome.CG * genome.l) / (genome.C * genome.G)

    enrichment.score.relH <- regions.relH / genome.relH
    enrichment.score.GoGe <- regions.GoGe / genome.GoGe

    gc()
    return(list(genome=BSgenome,
                regions.CG=regions.CG,
                regions.C=regions.C,
                regions.G=regions.G,
                regions.relH=regions.relH,
                regions.GoGe=regions.GoGe,
                genome.C=genome.C,
                genome.G=genome.G,
                genome.CG=genome.CG,
                genome.relH=genome.relH,
                genome.GoGe=genome.GoGe,
                enrichment.score.relH=enrichment.score.relH,
                enrichment.score.GoGe=enrichment.score.GoGe))
  }

er <- MEDIPS.CpGenrichNew(
  file = bam_file,
  BSgenome = BSgenome,
  uniq = uniq,
  extend = extend,
  shift = shift,
  paired = paired,
  chr.select = chr.select
)

## the two saturation correlations (satr.est_cor, satr.tru_cor) involve randomness and
## differ between runs; every other metric is expected to be reproducible

message("Writing out MEDIPS QC metrics: saturation, CpG coverage and CpG enrichment.")

## CpG coverage bins: 0 reads, exactly 1..5 reads, more than 5 reads
qc_n_cpg <- length(cr$cov.res)
qc_bin_counts <- integer(7)
qc_bin_counts[1] <- length(cr$cov.res[cr$cov.res < 1])
for (depth in 1:5) {
  qc_bin_counts[depth + 1] <- length(cr$cov.res[cr$cov.res == depth])
}
qc_bin_counts[7] <- length(cr$cov.res[cr$cov.res > 5])
qc_bin_suffix <- c("woReads", "w1read", "w2Reads", "w3Reads", "w4Reads", "w5Reads", "gt5Reads")

## metric names, in the same order as the values below
qc_metric_names <- c(
  "ref_genome",
  "satr.est_cor",
  "satr.tru_cor",
  "CpG_cov.totalNumReads",
  "CpG_cov.numReadsWoCpG",
  "CpG_cov.fracReadsWoCpG",
  "CpG_cov.numCpGinRef",
  paste0("CpG_cov.numCpG", qc_bin_suffix),
  paste0("CpG_cov.fracCpG", qc_bin_suffix),
  "enrich.regions.C",
  "enrich.regions.G",
  "enrich.regions.CG",
  "enrich.genome.C",
  "enrich.genome.G",
  "enrich.genome.CG",
  "enrich.regions.relH",
  "enrich.genome.relH",
  "enrich.regions.GoGe",
  "enrich.genome.GoGe",
  "enrich.enrichment.score.relH",
  "enrich.enrichment.score.GoGe"
)

## the genome name comes first, so c() turns every value into a string
qc_metric_values <- c(
  er$genome,
  round(sr$maxEstCor[2], 5),
  round(sr$maxTruCor[2], 5),
  cr$numberReads,
  cr$numberReadsWO,
  round(cr$numberReadsWO / cr$numberReads, 5),
  qc_n_cpg,
  qc_bin_counts,
  round(qc_bin_counts / qc_n_cpg, 5),
  er$regions.C,
  er$regions.G,
  er$regions.CG,
  er$genome.C,
  er$genome.G,
  er$genome.CG,
  round(er$regions.relH, 5),
  round(er$genome.relH, 5),
  round(er$regions.GoGe, 5),
  round(er$genome.GoGe, 5),
  round(er$enrichment.score.relH, 5),
  round(er$enrichment.score.GoGe, 5)
)

QC_MEDIPS.df <- data.frame(
  QC_type = rep("medips_QC", 33),
  metrics = qc_metric_names,
  values = qc_metric_values
)
## the value column is named after the sample
names(QC_MEDIPS.df) <- c("QC_type", "metrics", fname)
# QC_MEDIPS.df

write.table(
  QC_MEDIPS.df,
  paste0(out_dir, fname, "_QC_MEDIPS.csv"),
  row.names = FALSE,
  quote = FALSE,
  sep = '\t'
)

R tidyverse BSgenome.Hsapiens.UCSC.hg38 docopt GTools MEDIPS From line 1 of QC/QC_MEDIPS.R

A snakemake workflow to process ATAC-seq data

# Positional arguments: <bamfile> <species>, where species is "mm" or "hs"
myargs <- commandArgs(trailingOnly=TRUE)
if (length(myargs) < 2) {
  # Without this check a missing species is NA and the comparison below stops
  # with an uninformative "missing value where TRUE/FALSE needed" error.
  print("params ERROR: expected 2 arguments: <bamfile> <species ('mm' or 'hs')>")
  print("exiting...")
  quit(status=1)
}
bamfile <- myargs[1]
species <- myargs[2]

print("loading packages (ATACseqQC, ggplot, etc)...")
suppressPackageStartupMessages(library(ggplot2, quietly=TRUE))
suppressPackageStartupMessages(library(Rsamtools, quietly=TRUE))
suppressPackageStartupMessages(library(ATACseqQC, quietly=TRUE))
suppressPackageStartupMessages(library(ChIPpeakAnno, quietly=TRUE))
suppressPackageStartupMessages(library("GenomicAlignments", quietly=TRUE))

# Pick the annotation (txdb) and genome objects matching the species code
if (species == "mm") {
  suppressPackageStartupMessages(library(TxDb.Mmusculus.UCSC.mm10.knownGene, quietly=TRUE))
  suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm10, quietly=TRUE))
  txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene
  bsgenome <- BSgenome.Mmusculus.UCSC.mm10
  genome <- Mmusculus
  print("species is 'mm' using mm10 for analysis")
  ### Note: Everything below is deprecated until I can figure out a way to port a 
  ### static/local package with snakemake
  # Note: phastCons60way was manually curated from GenomicAlignments, built, and installed as an R package
  # score was obtained according to: https://support.bioconductor.org/p/96226/
  # package was built and installed according to: https://www.bioconductor.org/packages/devel/bioc/vignettes/GenomicScores/inst/doc/GenomicScores.html
  # (section 5.1: Building an annotation package from a GScores object)
  #suppressWarnings(suppressPackageStartupMessages(library(GenomicScores, lib.loc="/users/dia6sx/snakeATAC/scripts/", quietly=TRUE)))
  #suppressWarnings(suppressPackageStartupMessages(library(phastCons60way.UCSC.mm10, lib.loc="/users/dia6sx/snakeATAC/scripts/", quietly=TRUE)))
} else if (species == "hs") {
  suppressPackageStartupMessages(library(TxDb.Hsapiens.UCSC.hg38.knownGene, quietly=TRUE))
  suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38, quietly=TRUE))
  txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene
  bsgenome <- BSgenome.Hsapiens.UCSC.hg38
  genome <- Hsapiens
  print("species is 'hs' using hg38 for analysis")
} else {
  print(paste("params ERROR: ATACseqQC is not configured to use species =", species))
  print("exiting...")
  quit(status=1)
}

# Run the ATACseqQC quality-control steps on one BAM file and save the figures.
#
# Arguments:
#   bamfile   path to the BAM file; outputs are placed relative to the
#             directory two levels above it (dirname(dirname(bamfile)))
#   txdb      transcript database used for transcripts and TSS positions
#   bsgenome  not referenced in the body (kept so the signature is unchanged)
#   genome    genome object handed to splitGAlignmentsByCut
#
# Side effects: writes PNG figures to <project>/qc/ATACseqQC, creates the
# temporary directories <project>/alignments_shifted and
# <project>/alignments_split, and removes both again at the end.
doATACseqQC <- function(bamfile, txdb, bsgenome, genome) {
    project_dir <- dirname(dirname(bamfile))

    # Fragment size distribution
    print(paste0("generating output for ", strsplit(basename(bamfile), split='\\.')[[1]][1], "..."))
    print("calculating Fragment size distribution...")
    # Strip only a literal trailing ".bam"; an unanchored ".bam" pattern would
    # also remove text such as "_bam" in the middle of the name.
    bamfile.labels <- sub("\\.bam$", "", basename(bamfile))
    loc_to_save_figures <- paste0(project_dir, "/qc/ATACseqQC")
    if (file.exists(loc_to_save_figures)) {
        print("Warning: old figures will be overwritten")
    } else {
        # recursive: the parent "qc" directory may not exist yet
        dir.create(loc_to_save_figures, recursive = TRUE)
    }
    # Full path of a figure for this sample, given its file-name suffix
    figure_path <- function(suffix) {
        paste0(loc_to_save_figures, "/", bamfile.labels, suffix)
    }
    png(figure_path("_fragment_size_distribution.png"))
    fragSizeDist(bamfile, bamfile.labels)
    dev.off()

    # Adjust the read start sites
    print("adjusting read start sites...")
    ## bamfile tags to be read in
    possibleTag <- list("integer"=c("AM", "AS", "CM", "CP", "FI", "H0", "H1", "H2",
                                    "HI", "IH", "MQ", "NH", "NM", "OP", "PQ", "SM",
                                    "TC", "UQ"),
                        "character"=c("BC", "BQ", "BZ", "CB", "CC", "CO", "CQ", "CR",
                                      "CS", "CT", "CY", "E2", "FS", "LB", "MC", "MD",
                                      "MI", "OA", "OC", "OQ", "OX", "PG", "PT", "PU",
                                      "Q2", "QT", "QX", "R2", "RG", "RX", "SA", "TS",
                                      "U2"))
    ## keep only the tags that actually occur in the first 100 records
    bamTop100 <- scanBam(BamFile(bamfile, yieldSize = 100),
                         param = ScanBamParam(tag=unlist(possibleTag)))[[1]]$tag
    tags <- names(bamTop100)[lengths(bamTop100)>0]
    ## files will be output into shifted_dir
    ## shift the coordinates of 5'ends of alignments in the bam file
    shifted_dir <- paste0(project_dir, "/alignments_shifted")
    shiftedBamfile <- file.path(shifted_dir, paste0(bamfile.labels, "_shifted.bam"))
    # check if shifted Bam file exists from previous run
    if (file.exists(shiftedBamfile)) {
        print("Shifted Bamfile found.")
        print("Loading in...")
        gal <- readBamFile(shiftedBamfile, tag=tags, asMates=TRUE, bigFile=TRUE)
        ## This step is mostly for formating so splitBam can
        ## take in bamfile. Implementing shift of 0 bp on positive strand
        ## and 0 bp on negative strand because shifted Bamfile should
        ## already have these shifts
        gal1 <- shiftGAlignmentsList(gal, positive = 0L, negative = 0L)
    } else {
        # shifted bam file does not exist check if
        # old shifted alignments directory exists
        # if so remove and create new one
        if (file.exists(shifted_dir)){
            unlink(shifted_dir, recursive=TRUE)
        }
        dir.create(shifted_dir)
        print("*** creating shifted bam file ***")
        # the input BAM is only read here, where it is actually needed
        gal <- readBamFile(bamfile, tag=tags, asMates=TRUE, bigFile=TRUE)
        gal1 <- shiftGAlignmentsList(gal, outbam=shiftedBamfile)
    }

    # Promoter/Transcript body (PT) score
    print("calculating Promoter/Transcript body (PT) score...")
    txs <- transcripts(txdb)
    pt <- PTscore(gal1, txs)
    png(figure_path("_ptscore.png"))
    plot(pt$log2meanCoverage, pt$PT_score,
         xlab="log2 mean coverage",
         ylab="Promoter vs Transcript",
         main=paste(bamfile.labels,"PT score"))
    dev.off()

    # Nucleosome Free Regions (NFR) score
    print("calculating Nucleosome Free Regions (NFR) score")
    nfr <- NFRscore(gal1, txs)
    png(figure_path("_nfrscore.png"))
    plot(nfr$log2meanCoverage, nfr$NFR_score,
         xlab="log2 mean coverage",
         ylab="Nucleosome Free Regions score",
         main=paste0(bamfile.labels, "\n", "NFRscore for 200bp flanking TSSs"),
         xlim=c(-10, 0), ylim=c(-5, 5))
    dev.off()

    # Transcription Start Site (TSS) Enrichment Score
    print("calculating Transcription Start Site (TSS) Enrichment score")
    tsse <- TSSEscore(gal1, txs)
    png(figure_path("_tss_enrichment_score.png"))
    plot(100*(-9:10-.5), tsse$values, type="b",
         xlab="distance to TSS",
         ylab="aggregate TSS score",
         main=paste0(bamfile.labels, "\n", "TSS Enrichment score"))
    dev.off()

    # Split reads, Heatmap and coverage curve for nucleosome positions
    print("splitting reads by fragment length...")
    split_dir <- paste0(project_dir, "/alignments_split")
    TSS <- promoters(txs, upstream=0, downstream=1)
    TSS <- unique(TSS)
    ## estimate the library size for normalization
    librarySize <- estLibSize(bamfile)
    ## calculate the signals around TSSs.
    NTILE <- 101
    dws <- ups <- 1010
    fragment_classes <- c("NucleosomeFree",
                          "mononucleosome",
                          "dinucleosome",
                          "trinucleosome")
    splitBamfiles <- paste0(split_dir, "/", fragment_classes, ".bam")
    # check if split Bam files exists from previous run
    if (all(file.exists(splitBamfiles))) {
        print("*** split bam files found! ***")
        print("Loading in...")
        sigs <- enrichedFragments(bamfiles=splitBamfiles,
                                  index=splitBamfiles,
                                  TSS=TSS,
                                  librarySize=librarySize,
                                  TSS.filter=0.5,
                                  n.tile = NTILE,
                                  upstream = ups,
                                  downstream = dws)
    } else {
        # split bam files do not exist check if
        # old split alignments directory exists
        # if so remove and create new one
        if (file.exists(split_dir)){
            unlink(split_dir, recursive=TRUE)
        }
        print("*** creating split bam files ***")
        dir.create(split_dir)
        ## split the reads into NucleosomeFree, mononucleosome,
        ## dinucleosome and trinucleosome.
        ## and save the binned alignments into bam files.
        objs <- splitGAlignmentsByCut(gal1, txs=txs, genome=genome, outPath = split_dir)
        #objs <- splitBam(bamfile, tags=tags, outPath=outPath,
            #        txs=txs, genome=genome,
            #       conservation=phastCons60way.UCSC.mm10,
            #      seqlev=paste0("chr", c(1:19, "X", "Y")))
        sigs <- enrichedFragments(gal=objs[fragment_classes],
                                  TSS=TSS,
                                  librarySize=librarySize,
                                  TSS.filter=0.5,
                                  n.tile = NTILE,
                                  upstream = ups,
                                  downstream = dws)
    }
    ## log2 transformed signals
    sigs.log2 <- lapply(sigs, function(.ele) log2(.ele+1))
    ## plot heatmap
    png(figure_path("_nucleosome_pos_heatmap.png"))
    featureAlignedHeatmap(sigs.log2, reCenterPeaks(TSS, width=ups+dws),
                          zeroAt=.5, n.tile=NTILE)
    dev.off()
    ## get signals normalized for nucleosome-free and nucleosome-bound regions.
    out <- featureAlignedDistribution(sigs,
                                      reCenterPeaks(TSS, width=ups+dws),
                                      zeroAt=.5, n.tile=NTILE, type="l",
                                      ylab="Averaged coverage")
    ## rescale the nucleosome-free and nucleosome signals to 0~1
    range01 <- function(x){(x-min(x))/(max(x)-min(x))}
    out <- apply(out, 2, range01)
    png(figure_path("_TSS_signal_distribution.png"))
    matplot(out, type="l", xaxt="n",
            xlab="Position (bp)",
            ylab="Fraction of signal",
            main=paste0(bamfile.labels, "\n", "TSS signal distribution"))
    axis(1, at=seq(0, 100, by=10)+1,
         labels=c("-1K", seq(-800, 800, by=200), "1K"), las=2)
    abline(v=seq(0, 100, by=10)+1, lty=2, col="gray")
    dev.off()

    print("QC Finished.")
    print("Generated QC figures can be found in qc folder under ATACseQC")
    # Both intermediate directories are temporary and removed once figures exist
    print(paste("*** removing temp files in",split_dir,"***"))
    unlink(split_dir, recursive=TRUE)
    print(paste("*** removing temp files in",shifted_dir,"***"))
    unlink(shifted_dir, recursive=TRUE)
}

doATACseqQC(bamfile, txdb, bsgenome, genome)

R Snakemake ggplot2 BSgenome.Hsapiens.UCSC.hg38 BSgenome.Mmusculus.UCSC.mm10 TxDb.Mmusculus.UCSC.mm10.knownGene TxDb.Hsapiens.UCSC.hg38.knownGene GenomicAlignments Rsamtools ATACseqQC GenomicScores From line 23 of scripts/doATACseqQC.R

Snakemake pipeline for Epicure analyses (0.14.1)

base::message("Loading libraries ... ")
suppressPackageStartupMessages(library("ATACseqQC"))
suppressPackageStartupMessages(library("TxDb.Hsapiens.UCSC.hg38.knownGene"))
suppressPackageStartupMessages(library("BSgenome.Hsapiens.UCSC.hg38"))
suppressPackageStartupMessages(library("phastCons100way.UCSC.hg38"))
suppressPackageStartupMessages(library("MotifDb"))
suppressPackageStartupMessages(library("ChIPpeakAnno"))
suppressPackageStartupMessages(library("Rsamtools"))
base::message("Libraries loaded.")

# base::message("Setting sequence level style...")
# seqlevelsStyle(TxDb.Hsapiens.UCSC.hg38.knownGene) <- "Ensembl"
# seqlevelsStyle(BSgenome.Hsapiens.UCSC.hg38) <- "Ensembl"
# base::message("Database chromosome renamed.")

# Resolve the sample label and the BAM input handed over by Snakemake.
# No helper variables are created here: the whole workspace is written to the
# `rda` output at the end of the script.
base::message("Acquiering bam file...")
bamfile <- BamFile(base::as.character(snakemake@input[["bam"]]))
name <- base::as.character(snakemake@params[["name"]])
base::print(x = bamfile)
base::print(x = name)
base::message("BamFiles identified")

# Probe the BAM (yieldSize = 100) for every candidate tag listed below and
# keep the names of those that come back non-empty; `tags` is what the full
# read later in the script asks for.
base::message("Reading bam tags...")
possibleTag <- list(
    integer = c(
        "AM", "AS", "CM", "CP", "FI", "H0", "H1", "H2", "HI",
        "IH", "MQ", "NH", "NM", "OP", "PQ", "SM", "TC", "UQ"
    ),
    character = c(
        "BC", "BQ", "BZ", "CB", "CC", "CO", "CQ", "CR", "CS", "CT", "CY",
        "E2", "FS", "LB", "MC", "MD", "MI", "OA", "OC", "OQ", "OX", "PG",
        "PT", "PU", "Q2", "QT", "QX", "R2", "RG", "RX", "SA", "TS", "U2"
    )
)
bamTop100 <- scanBam(
    BamFile(bamfile$path, yieldSize = 100),
    param = ScanBamParam(tag = unlist(possibleTag))
)[[1]][["tag"]]
# Only tags with at least one value in the probed records are retained.
tags <- names(bamTop100)[lengths(bamTop100) > 0L]
base::print(x = tags)
base::message("Tags Acquired")

# Select chr1-chr22, chrX and chrY from the TxDb sequence information and
# convert those entries to GRanges for use as the BAM read window.
base::message("Retrieving sequence level informations...")
seqlev <- paste0("chr", c(1:22, "X", "Y"))
seqinformation <- seqinfo(x = TxDb.Hsapiens.UCSC.hg38.knownGene)
which <- as(object = seqinformation[seqlev], Class = "GRanges")
base::print(x = which)
base::message("Sequences retrived")

# Load the alignments as mates, restricted to the ranges in `which`.
# `readBamFile()` names its tag argument `tag`; the previous `tags = tags`
# was swallowed by `...`, so the tags detected above were never requested.
# The trailing comma is dropped as well: it passed an empty argument into
# `...`, which errors as soon as the dots are evaluated.
base::message("Loading bam file...")
bamdata <- readBamFile(
    bamFile = bamfile$path,
    which = which,
    tag = tags,
    asMates = TRUE,
    bigFile = TRUE
)
base::message("Bam file loaded")

# Shift the alignments and write them to the Snakemake `shifted` output.
base::message("Shifting bam...")
shiftedBamfile <- base::as.character(x = snakemake@output[["shifted"]])
shiftedBamdir <- base::dirname(shiftedBamfile)
print(shiftedBamdir)
# Create the output directory only when it is missing: calling dir.create()
# on an existing directory emits a spurious warning on every run.
if (!base::dir.exists(paths = shiftedBamdir)) {
    base::dir.create(
        path = shiftedBamdir,
        recursive = TRUE
    )
}
bamdata <- shiftGAlignmentsList(
    gal = bamdata,
    outbam = shiftedBamfile
)
print(bamdata)
base::message("Shift OK")

# Look up the requested motif in MotifDb; the first hit is the one used for
# the footprint analysis further down.
base::message("Acquiering motif...")
motif_name <- base::as.character(x = snakemake@params[["motif"]])
motif <- query(MotifDb, c(motif_name))
motif <- as.list(motif)
# Fail with an explicit message instead of the opaque "subscript out of
# bounds" that `motif[[1]]` raises when the query matched nothing.
if (base::length(motif) == 0L) {
    base::stop(
        "No motif matching '", motif_name, "' was found in MotifDb.",
        call. = FALSE
    )
}
print(motif[[1]], digits = 2)
base::message("Motif retrieved.")

# Draw the factor footprint for the first motif hit into the Snakemake `png`
# output, then save the full workspace to the `rda` output.
base::message("plot Footprints...")
genome <- Hsapiens
print(genome)

png(
    filename = snakemake@output[["png"]],
    width = 1024,
    height = 768,
    units = "px"
)
sigs <- factorFootprints(
    shiftedBamfile,
    pfm = motif[[1]],
    genome = genome,
    min.score = "90%",
    # Reuse the "chr"-prefixed levels the BAM was loaded with. The genome is
    # not renamed to Ensembl style in this script, so bare "1".."22", "X", "Y"
    # would not match any of its sequence names.
    seqlev = seqlev,
    upstream = 100,
    downstream = 100
)
dev.off()
base::message("Done.")

# Persist every object created by this script for later inspection.
base::save.image(file = base::as.character(x = snakemake@output[["rda"]]))
base::message("Process over")

R BSgenome.Hsapiens.UCSC.hg38 TxDb.Hsapiens.UCSC.hg38.knownGene Rsamtools ChIPpeakAnno ATACseqQC MotifDb From line 1 of factorfootprints/factor_footprints.R

Workflow Steps and Code Snippets