BioWorkflows: Reproducible reanalysis of a combined ChIP-Seq & RNA-Seq data set

MEMORY_REQUIREMENTS_GB = {
    'star': 30,
    'hisat2': 5,
    'rnaseq_count': 3,
    'salmon': 5,
    'kallisto': 5,
    'bowtie2': 5,
    'macs_callpeak': 30,
    'epic_callpeak': 20,
    'greylist': 30,
    'chipseq_count_windows': 60,
    'chipseq_count_bins': 20,
    'chipseq_count_regions': 3,
    'rnaseq_analyze': 10,
    'chipseq_analyze': 40,
}

Python Bowtie 2 Salmon kallisto HISAT2 From line 1 of master/mem_requirements.py

shell: '''inkscape {input:q} --export-png={output:q} --export-dpi=300'''

SnakeMake From line 32 of master/rulegraph.Snakefile

shell: '''inkscape {input:q} --export-pdf={output:q} --export-dpi=300'''

SnakeMake From line 37 of master/rulegraph.Snakefile

run:
    if is_target_rule(params.target_path):
        rule = get_rule(params.target_path)
        if len(rule.output):
            real_targets = rule.output
        else:
            real_targets = rule.input
    else:
        real_targets = [params.target_path]
    shell('''
    snakemake --nolock -f --dag {real_targets:q} | \
    dot -Grankdir=LR -Tsvg > {output:q}
    ''')

SnakeMake Snakemake From line 43 of master/rulegraph.Snakefile

run:
    if is_target_rule(params.target_path):
        rule = get_rule(params.target_path)
        if len(rule.output):
            real_targets = rule.output
        else:
            real_targets = rule.input
    else:
        real_targets = [params.target_path]
    shell('''
    snakemake --nolock -f --rulegraph {real_targets:q} | \
    dot -Grankdir=LR -Tsvg > {output:q}
    ''')

SnakeMake Snakemake From line 61 of master/rulegraph.Snakefile

suppressPackageStartupMessages({
    library(getopt)
    library(optparse)
    library(stringr)
    library(magrittr)
    library(assertthat)
    library(rctutils)
})

get_options <- function(opts) {
    optlist <- list(
        make_option(c("-s", "--samplemeta-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) RDS/RData/xlsx/csv file containing a table of sample metadata. Any existing rownames will be replaced with the values in the sample ID  column (see below)."),
        make_option(c("-c", "--sample-id-column"), type = "character", default = "Sample",
                    help = "Sample metadata column name that holds the sample IDs. These will be substituted into '--bam-file-pattern' to determine the BAM file names."),
        make_option(c("-f", "--filter-sample-ids"), type = "character",
                    help = "Comma-separated list of sample IDs. If this options is provided, only the specified sample IDs will be used."),
        make_option(c("-p", "--bam-file-pattern"), metavar = "PATTERN", type = "character",
                    help = "(REQUIRED) Format string to convert sample IDs into BAM file paths. This should contain the string '{SAMPLE}' wherever the sample ID should be substituted (this can occur multiple times),. Example: 'bam_files/Sample_{SAMPLE}/Aligned.bam"),
        make_option(c("-t", "--targets"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) File specifying target genomic positions around which reads should be counted. This can be a BED file, GFF file, narrowPeak file, R data file containing a GRanges object, or csv file that can be converted to a GRanges object. If the regions have associated annotations, then a GRanges in an R data file is the recommended format. Generally the ranges specified should each be only a single base pair, which will be used as the center of the neighborhood. If any ranges are longer than 1bp, the neighborhoods will be formed around the 5-prime ends (or, for ranges with no strand information, the end closest to the beginning of the chromosome)."),
        make_option(c("-u", "--upstream-neighborhood"), type = "character", default = "5kbp", metavar = "5kbp",
                    help = "How far upstream to extend the neighborhood around each target."),
        make_option(c("-d", "--downstream-neighborhood"), type = "character", default = "5kbp", metavar = "5kbp",
                    help = "How far downstream to extend the neighborhood around each target."),
        make_option(c("-w", "--window-width"), type = "character", default = "1kbp", metavar = "1kbp",
                    help = "Width of windows in which to count."),
        make_option(c("--window-spacing"), metavar = "BP", type = "character",
                    help = "Spacing between the start points of consecutive windows. By default, this is identical to the window width, so that the windows exactly tile the neighborhood. Changing this results in either gapped windows (spacing > width) or overlapping windows (spacing < width)."),
        make_option(c("--initial-window-offset"), type = "character", default = "0bp", metavar = "0bp",
                    help = "Offset of each neighborhood's central window relative to each target. The default of 0 means that the central window will be centered directly around the target itself. Negative values place the center of the window further upstream (in the 5-prime direction), while positive values place it downstream (in the 3-prime direction). If you want a border between adjacent windows to fall on the target, set this to half the window width."),
        make_option(c("-o", "--output-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) Output file name. The SummarizedExperiment object containing the counts will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-b", "--expected-bam-files"), metavar = "BAMFILE1,BAMFILE2,...", type = "character",
                    help = "Comma-separated list of bam file names expected to be used as input. This argument is optional, but if it is provided, it will be checked against the list of files determined from '--samplemeta-file' and '--bam-file-pattern', and an error will be raised if they don't match exactly."),
        make_option(c("-e", "--read-extension"), type = "character", default = "100bp", metavar = "100bp",
                    help = "Assumed fragment length of unpaired reads. Each single read will be assumed to represent a DNA fragment extending this far from its 5 prime end, regardless of the actual read length. (Mated reads pairs already define a fragment length and are unaffected by this option.) Note that each read is counted into the window that contains the midpoint of the represented fragment."),
        make_option(c("-x", "--blacklist"), metavar = "FILENAME.bed", type = "character",
                    help = "File describing blacklist regions to be excluded from the analysis. Windows that overlap these regions will have their counts set to NA. This can be a BED file, GFF file, R data file containing a GRanges object, or csv file that can be converted to a GRanges object."),
        make_option(c("-a", "--blacklist-action"), metavar = "ACTION", type = "character", default = "mark",
                    help = "What action to take on windows that overlap blacklisted regions. Options are 'mark', 'setNA', and 'discard'. The default, 'mark', adds an additional logical column to the rowData of the output named 'blacklist' that is TRUE for windows overlapping the blacklist and FALSE for the rest. 'setNA' additionally sets the read count for blacklisted windows to NA. 'discard' throws away any blacklisted windows, so that they will not be present at all in the output."),
        make_option(c("-j", "--threads"), metavar = "N", type = "integer", default = 1,
                    help = "Number of threads to use"))
    progname <- na.omit(c(get_Rscript_filename(), "chipseq-count-neighborhoods.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -s SAMPLEMETA.RDS -p PATTERN -t TARGETS.RDS -o SUMEXP.RDS",
        description = "Count ChIP-seq reads in neighborhoods around a set of specified genomic positions.",
        option_list = optlist,
        add_help_option = TRUE,
        prog = progname,
        epilogue = "A \"neighborhood\" consists of a set of windows at regular offsets relative to a genomic position. For example, windows every 200 bp from 5kbp upstream to 2kb downstream. Each window is annotated with an 'offset' column that indicates the distance from the center of that window to the specified genomic position. Note that the windows at the edges of the neighborhood will likely extend past the specified distances upstream and downstream, since these distances refer to the center of the window, not the outer edge. In addition, note that not every neighborhood is guaranteed to contain a complete \"set\" of windows, since it may extend off the edge of a chromosome,

Note that all base pair sizes (window width/spacing and read extension) may have a suffix of 'bp', 'kbp', 'mbp', or 'tbp'. For example, 10kbp = 10000.")

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("samplemeta-file", "output-file", "bam-file-pattern", "targets")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }

    ## Split list arguments
    for (i in c("filter-sample-ids", "expected-bam-files")) {
        if (i %in% names(cmdopts)) {
            cmdopts[[i]] %<>% str_split(",") %>% unlist
        }
    }

    ## Convert bp args to numbers
    for (i in c("upstream-neighborhood", "downstream-neighborhood", "window-width", "window-spacing", "initial-window-offset", "read-extension")) {
        if (i %in% names(cmdopts)) {
            cmdopts[[i]] %<>% parse_bp
        }
    }

    if (is.null(cmdopts[["window-spacing"]])) {
        cmdopts[["window-spacing"]] <- cmdopts[["window-width"]]
    }

    ## Validate blacklist action
    if ("blacklist-action" %in% names(cmdopts)) {
        cmdopts[["blacklist-action"]] %<>%
            match_arg(c("mark", "setNA", "discard"), arg_name = "--blacklist-action", ignore.case = TRUE)
    }

    cmdopts$threads %<>% round
    assert_that(cmdopts$threads >= 1)

    cmdopts$help <- NULL

    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

## Terminate early on argument-processing errors
invisible(get_options(commandArgs(TRUE)))

suppressPackageStartupMessages({
    library(dplyr)
    library(glue)
    library(future)
    library(GenomicRanges)
    library(SummarizedExperiment)
    library(GenomicAlignments)
    library(rlang)
    library(forcats)
    library(csaw)
})

options(future.globals.maxSize = 4 * 1024^3)

## cmdopts <- list(
##     samplemeta_file = "saved_data/samplemeta-ChIPSeq.RDS",
##     sample_id_column = "SRA_run",
##     bam_file_pattern = "aligned/chipseq_bowtie2_hg38.analysisSet/{SAMPLE}/Aligned.bam",
##     targets = "saved_data/tss_shoal_hg38.analysisSet_ensembl.85.RDS",
##     upstream_neighborhood = 5000,
##     downstream_neighborhood = 5000,
##     window_width = 500,
##     initial_window_offset = 0,
##     output_file = "saved_data/tss-neighborhood-counts_hg38.analysisSet_ensembl.85_5kbp-radius_500bp-windows_147bp-reads.RDS",
##     read_extension = 147,
##     blacklist = "saved_data/ChIPSeq-merged-blacklist.bed",
##     blacklist_action = "mark",
##     threads = 4,
##     window_spacing = 500)

{
    cmdopts <- get_options(commandArgs(TRUE))
    ## TODO: Don't use setwd in this or any other script
    tryCatch(setwd(file.path(dirname(na.omit(get_Rscript_filename())), "..")),
             error = function(...) tsmsg("WARNING: Could not determine script path. Ensure that you are already in the correct directory."))

    tsmsg("Args:")
    print_var_vector(cmdopts)

    if (cmdopts$threads > 1) {
        use_futures("multicore", workers = cmdopts$threads, quiet = TRUE)
    } else {
        use_futures("sequential", quiet = TRUE)
        register(SerialParam())
    }
    tsmsg(glue("Using {cmdopts$threads} cores."))

    tsmsg(glue("Assuming a fragment size of {format_bp(cmdopts$read_extension)} for unpaired reads."))

    tsmsg("Loading sample data")

    sample_table <- readRDS(cmdopts$samplemeta_file) %>%
        ## Compute full path to BAM file
        mutate(bam_file = glue(cmdopts$bam_file_pattern, SAMPLE = .[[cmdopts$sample_id_column]])) %>%
        ## Ensure that days_after_activation is a factor and can't be
        ## interpreted as a numeric
        mutate(days_after_activation = days_after_activation %>%
                   factor %>% fct_relabel(~str_c("Day", .))) %>%
        rename(time_point = days_after_activation)

    if (!is.null(cmdopts$filter_sample_ids)) {
        tsmsg("Selecting only ", length(cmdopts$filter_sample_ids), " specified samples.")
        assert_that(all(cmdopts$filter_sample_ids %in% sample_table[[cmdopts$sample_id_column]]))
        sample_table %<>% .[.[[cmdopts$sample_id_column]] %in% cmdopts$filter_sample_ids,]
    }

    assert_that(all(file.exists(sample_table$bam_file)))

    if ("expected_bam_files" %in% names(cmdopts)) {
        tryCatch({
            assert_that(setequal(samplemeta$bam_file, cmdopts$expected_bam_files))
            tsmsg("Sample metadata contains all expected bam files")
        }, error = function(...) {
            unexpected_existing <- setdiff(samplemeta$bam_file, cmdopts$expected_bam_files)
            expected_but_missing <- setdiff(cmdopts$expected_bam_files, samplemeta$bam_file)
            if (length(unexpected_existing) > 0) {
                tsmsg(glue("Got unexpected bam files: {deparse(unexpected_existing)}"))
            }
            if (length(expected_but_missing) > 0) {
                tsmsg(glue("Didn't find expected bam files: {deparse(expected_but_missing)}"))
            }
            stop("Bam file list was not as expected")
        })
    }

    tsmsg("Loading target positions")
    targets <- read_regions(cmdopts$targets)
    assert_that(is(targets, "GRanges"))
    if (any(strand(targets) == "*")) {
        warning("Some targets have no strand information, and will be treated as being on the plus strand.")
    }
    ## Reduce targets to 5-prime end only
    targets %<>% resize(width = 1, fix = "start")

    blacklist_regions <- GRanges()
    if (!is.null(cmdopts$blacklist)) {
        tsmsg("Loading blacklist regions")
        blacklist_regions <- read_regions(cmdopts$blacklist)
        assert_that(is(blacklist_regions, "GRanges"))
        ## Blacklist applies to both strands
        strand(blacklist_regions) <- "*"
    }

    nhood_offsets <- cmdopts %$% c(
        rev(seq(from = initial_window_offset,
                to = -upstream_neighborhood,
                by = -window_spacing)),
        seq(from = initial_window_offset + window_spacing,
            to = downstream_neighborhood,
            by = window_spacing))

    assert_that(is_named(targets))
    nhood_windows <- rep(targets, each = length(nhood_offsets))
    nhood_windows$offset <- rep(nhood_offsets, length.out = length(nhood_windows))
    nhood_windows %<>%
        shift(.$offset * ifelse(strand(.) == "-", -1, 1)) %>%
        resize(width = cmdopts$window_width, fix = "center")
    ## Add offset to window names
    names(nhood_windows) %<>% str_c(sprintf("%+ibp", nhood_windows$offset))

    if (length(blacklist_regions) > 0) {
        blacklisted <- overlapsAny(nhood_windows, blacklist_regions, ignore.strand = TRUE)
        nhood_windows$blacklist <- blacklisted
        if (cmdopts$blacklist_action == "discard") {
            tsmsg("Discarding blacklisted windows")
            nhood_windows %<>% .[!.$blacklisted]
        } else if (cmdopts$blacklist_action %in% c("mark", "setNA")) {
            ## Makring is handled above, and setNA will be handled
            ## later
            NULL
        } else {
            stop(glue("Unknown blacklist action '{cmdopts$blacklist_action}'"))
        }
    } else {
        ## We always add the blacklist column, but without a blacklist
        ## it is simply false for everything
        nhood_windows$blacklist <- FALSE
    }

    tsmsg(glue("Counting reads in neighborhoods around {length(targets)} regions in {nrow(sample_table)} samples."))
    tsmsg(glue("Neighborhoods consist of {length(nhood_offsets)} windows of width {format_bp(cmdopts$window_width)} tiled from {format_bp(cmdopts$upstream_neighborhood)} upstream to {format_bp(cmdopts$downstream_neighborhood)} downstream."))

    param <- readParam(BPPARAM=bpparam())
    rcounts <- regionCounts(
        sample_table$bam_file, regions=nhood_windows,
        # See ?windowCounts for explanation of "ext=list(...)"
        ext=list(rep(cmdopts$read_extension, nrow(sample_table)), 1),
        param = param)

    ## Add sample metadata to colData in front of mapping stats
    colData(rcounts)$bam.files <- NULL
    colData(rcounts) %<>% cbind(sample_table, .)
    colnames(rcounts) <- sample_table[[cmdopts$sample_id_column]]

    ## Set blacklisted window counts to NA, if requested
    if (cmdopts$blacklist_action == "setNA") {
        bl <- rowRanges(rcounts)$blacklist
        assay(rcounts, "counts")[bl,] <- NA
    }

    ## Save command and options in the metadata
    metadata(rcounts)$cmd.name <- na.omit(c(get_Rscript_filename(), "chipseq-count-neighborhoods.R"))[1]
    metadata(rcounts)$cmd.opts <- cmdopts

    tsmsg("Saving output file")
    saveRDS(rcounts, cmdopts$output_file)
    tsmsg("Finished.")
}

R dplyr stringr optparse magrittr GenomicRanges GLUE forcats SummarizedExperiment assertthat future GenomicAlignments getopt rctutils csaw rlang From line 3 of scripts/chipseq-count-neighborhoods.R

library(getopt)
library(optparse)
library(stringr)
library(magrittr)
library(assertthat)
library(rctutils)

get_options <- function(opts) {
    optlist <- list(
        make_option(c("-s", "--samplemeta-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) RDS/RData/xlsx/csv file containing a table of sample metadata. Any existing rownames will be replaced with the values in the sample ID  column (see below)."),
        make_option(c("-c", "--sample-id-column"), type = "character", default = "Sample",
                    help = "Sample metadata column name that holds the sample IDs. These will be substituted into '--bam-file-pattern' to determine the BAM file names."),
        make_option(c("-f", "--filter-sample-ids"), type = "character",
                    help = "Comma-separated list of sample IDs. If this options is provided, only the specified sample IDs will be used."),
        make_option(c("-p", "--bam-file-pattern"), metavar = "PATTERN", type = "character",
                    help = "(REQUIRED) Format string to convert sample IDs into BAM file paths. This should contain the string '{SAMPLE}' wherever the sample ID should be substituted (this can occur multiple times),. Example: 'bam_files/Sample_{SAMPLE}/Aligned.bam"),
        make_option(c("-r", "--regions"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) File specifying regions in which reads should be counted. This can be a BED file, GFF file, narrowPeak file, R data file containing a GRanges object, or csv file that can be converted to a GRanges object. If the regions have associated annotations, then a GRanges in an R data file is the recommended format."),
        make_option(c("-o", "--output-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) Output file name. The SummarizedExperiment object containing the counts will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-b", "--expected-bam-files"), metavar = "BAMFILE1,BAMFILE2,...", type = "character",
                    help = "Comma-separated list of bam file names expected to be used as input. This argument is optional, but if it is provided, it will be checked against the list of files determined from '--samplemeta-file' and '--bam-file-pattern', and an error will be raised if they don't match exactly."),
        make_option(c("-e", "--read-extension"), type = "character", default = "100bp",
                    help = "Assumed fragment length of reads. Each read will be assumed to represent a DNA fragment extending this far from its 5 prime end, regardless of the actual read length."),
        make_option(c("-x", "--blacklist"), metavar = "FILENAME.bed", type = "character",
                    help = "File describing blacklist regions to be excluded from the analysis. Reads that overlap these regions will be discarded without counting them toward any region. This can be a BED file, GFF file, R data file containing a GRanges object, or csv file that can be converted to a GRanges object."),
        make_option(c("-j", "--threads"), metavar = "N", type = "integer", default = 1,
                    help = "Number of threads to use"))
    progname <- na.omit(c(get_Rscript_filename(), "chipseq-count-windows.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -s SAMPLEMETA.RDS -p PATTERN -r REGIONS.RDS -o SUMEXP.RDS",
        description = "Count ChIP-seq reads a set of specified regions",
        option_list = optlist,
        add_help_option = TRUE,
        prog = progname,
        epilogue = "Note that all base pair sizes (window width/spacing and read extension) may have a suffix of 'bp', 'kbp', 'mbp', or 'tbp'. For example, 10kbp = 10000.")

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("samplemeta-file", "output-file", "bam-file-pattern", "regions")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }

    ## Split list arguments
    for (i in c("filter-sample-ids", "expected-bam-files")) {
        if (i %in% names(cmdopts)) {
            cmdopts[[i]] %<>% str_split(",") %>% unlist
        }
    }

    ## Convert bp args to numbers
    for (i in c("read-extension")) {
        cmdopts[[i]] %<>% parse_bp
    }

    cmdopts$threads %<>% round
    assert_that(cmdopts$threads >= 1)

    cmdopts$help <- NULL

    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

## Terminate early on argument-processing errors
invisible(get.options(commandArgs(TRUE)))

library(dplyr)
library(glue)
library(future)
library(GenomicRanges)
library(SummarizedExperiment)
library(csaw)
library(forcats)

## cmdopts <- list(
##     samplemeta_file = "saved_data/samplemeta-ChIPSeq.RDS",
##     sample_id_column = "SRA_run",
##     bam_file_pattern = "aligned/chipseq_bowtie2_hg38.analysisSet/{SAMPLE}/Aligned.bam",
##     regions = "saved_data/promoter-regions_hg38.analysisSet_knownGene_2.5kbp.RDS",
##     output_file = "saved_data/promoter-counts_hg38.analysisSet_knownGene_2.5kbp-radius_147bp-reads.RDS",
##     read_extension = 147,
##     blacklist = "saved_data/ChIPSeq-merged-blacklist.bed",
##     threads = 4)

{
    cmdopts <- get.options(commandArgs(TRUE))
    tryCatch(setwd(file.path(dirname(na.omit(get_Rscript_filename())), "..")),
             error = function(...) tsmsg("WARNING: Could not determine script path. Ensure that you are already in the correct directory."))

    tsmsg("Args:")
    print_var_vector(cmdopts)

    if (cmdopts$threads > 1) {
        use_futures("multicore", workers = cmdopts$threads, quiet = TRUE)
    } else {
        use_futures("sequential", quiet = TRUE)
    }
    tsmsg(glue("Using {cmdopts$threads} cores."))

    tsmsg(glue("Assuming a fragment size of {format_bp(cmdopts$read_extension)} for unpaired reads."))

    tsmsg("Loading sample data")

    sample_table <- readRDS(cmdopts$samplemeta_file) %>%
        ## Compute full path to BAM file
        mutate(bam_file = glue(cmdopts$bam_file_pattern, SAMPLE = .[[cmdopts$sample_id_column]])) %>%
        ## Ensure that days_after_activation is a factor and can't be
        ## interpreted as a numeric
        mutate(days_after_activation = days_after_activation %>%
                   factor %>% fct_relabel(~str_c("Day", .))) %>%
        rename(time_point = days_after_activation)

    if (!is.null(cmdopts$filter_sample_ids)) {
        tsmsg("Selecting only ", length(cmdopts$filter_sample_ids), " specified samples.")
        assert_that(all(cmdopts$filter_sample_ids %in% sample_table[[cmdopts$sample_id_column]]))
        sample_table %<>% .[.[[cmdopts$sample_id_column]] %in% cmdopts$filter_sample_ids,]
    }

    assert_that(all(file.exists(sample_table$bam_file)))

    if ("expected_bam_files" %in% names(cmdopts)) {
        tryCatch({
            assert_that(setequal(samplemeta$bam_file, cmdopts$expected_bam_files))
            tsmsg("Sample metadata contains all expected bam files")
        }, error = function(...) {
            unexpected_existing <- setdiff(samplemeta$bam_file, cmdopts$expected_bam_files)
            expected_but_missing <- setdiff(cmdopts$expected_bam_files, samplemeta$bam_file)
            if (length(unexpected_existing) > 0) {
                tsmsg(glue("Got unexpected bam files: {deparse(unexpected_existing)}"))
            }
            if (length(expected_but_missing) > 0) {
                tsmsg(glue("Didn't find expected bam files: {deparse(expected_but_missing)}"))
            }
            stop("Bam file list was not as expected")
        })
    }

    tsmsg("Loading regions")
    target_regions <- read_regions(cmdopts$regions)
    assert_that(is(target_regions, "GRanges"))
    ## Analysis is not stranded
    strand(target_regions) <- "*"

    blacklist_regions <- GRanges()
    if (!is.null(cmdopts$blacklist)) {
        tsmsg("Loading blacklist regions")
        blacklist_regions <- read_regions(cmdopts$blacklist)
        assert_that(is(blacklist_regions, "GRanges"))
        ## Blacklist applies to both strands
        strand(blacklist_regions) <- "*"
    }
    rparam <- readParam(discard = blacklist_regions)

    tsmsg(glue("Counting reads in {length(target_regions)} regions in {nrow(sample_table)} samples."))
    if (cmdopts$threads > 1) {
        rCountsFun <- regionCountsParallel
    } else {
        rCountsFun <- regionCounts
    }
    rcounts <- rCountsFun(
        sample_table$bam_file, regions=target_regions,
        ext=cmdopts$read_extension, param=rparam)

    ## Add sample metadata to colData in front of mapping stats
    colData(rcounts)$bam.files <- NULL
    colData(rcounts) %<>% cbind(sample_table, .)
    colnames(rcounts) <- sample_table[[cmdopts$sample_id_column]]

    ## Save command and options in the metadata
    metadata(sexp)$cmd_name <- na.omit(c(get_Rscript_filename(), "chipseq-count-regions.R"))[1]
    metadata(sexp)$cmd_opts <- cmdopts

    tsmsg("Saving output file")
    saveRDS(rcounts, cmdopts$output_file)
    tsmsg("Finished.")
}

R dplyr stringr optparse magrittr GenomicRanges GLUE forcats SummarizedExperiment assertthat future getopt rctutils csaw From line 3 of scripts/chipseq-count-regions.R

suppressMessages({
    library(getopt)
    library(optparse)
    library(stringr)
    library(glue)
    library(magrittr)
    library(GenomicRanges)
    library(SummarizedExperiment)
    library(dplyr)
    library(csaw)
    library(assertthat)
    library(forcats)
    library(rctutils)
    library(future)
})

get_options <- function(opts) {
    optlist <- list(
        make_option(c("-s", "--samplemeta-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) RDS/RData/xlsx/csv file containing a table of sample metadata. Any existing rownames will be replaced with the values in the sample ID  column (see below)."),
        make_option(c("-c", "--sample-id-column"), type = "character", default = "Sample",
                    help = "Sample metadata column name that holds the sample IDs. These will be substituted into '--bam-file-pattern' to determine the BAM file names."),
        make_option(c("-f", "--filter-sample-ids"), type = "character",
                    help = "Comma-separated list of sample IDs. If this options is provided, only the specified sample IDs will be used."),
        make_option(c("-p", "--bam-file-pattern"), metavar = "PATTERN", type = "character",
                    help = "(REQUIRED) Format string to convert sample IDs into BAM file paths. This should contain the string '{SAMPLE}' wherever the sample ID should be substituted (this can occur multiple times),. Example: 'bam_files/Sample_{SAMPLE}/Aligned.bam"),
        make_option(c("-o", "--output-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) Output file name. The SummarizedExperiment object containing the counts will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-b", "--expected-bam-files"), metavar = "BAMFILE1,BAMFILE2,...", type = "character",
                    help = "Comma-separated list of bam file names expected to be used as input. This argument is optional, but if it is provided, it will be checked against the list of files determined from '--samplemeta-file' and '--bam-file-pattern', and an error will be raised if they don't match exactly."),
        make_option(c("-w", "--window-width"), type = "character", default = "150bp",
                    help = "Width of windows in which to count."),
        make_option(c("--window-spacing"), metavar = "BP", type = "character",
                    help = "Spacing between the start points of consecutive windows. By default, this is identical to the window width, so that the windows exactly tile the genome. Changing this results in either gapped windows (spacing > width) or overlapping windows (spacing < width)."),
        make_option(c("-e", "--read-extension"), type = "character", default = "100bp",
                    help = "Assumed fragment length of reads. Each read will be assumed to represent a DNA fragment extending this far from its 5 prime end, regardless of the actual read length."),
        make_option(c("-x", "--blacklist"), metavar = "FILENAME.bed", type = "character",
                    help = "File describing blacklist regions to be excluded from the analysis. Reads that overlap these regions will be discarded without counting them toward any window. This can be a BED file, GFF file, R data file containing a GRanges object, or csv file that can be converted to a GRanges object."),
        make_option(c("--bin"), action = "store_true", default = FALSE,
                    help = "Run in bin mode, where each read is counted into exactly one bin."),
        make_option(c("-j", "--threads"), metavar = "N", type = "integer", default = 1,
                    help = "Number of threads to use"))
    progname <- na.omit(c(get_Rscript_filename(), "chipseq-count-windows.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -s SAMPLEMETA.RDS -p PATTERN -w WSIZE -e READEXT [ -s WSPACE ] -o SUMEXP.RDS",
        description = "Do window counting across the genome for ChIP-Seq data",
        option_list = optlist,
        add_help_option = TRUE,
        prog = progname,
        epilogue = "Note that all base pair sizes (window width/spacing and read extension) may have a suffix of 'bp', 'kbp', 'mbp', or 'tbp'. For example, 10kbp = 10000")

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("samplemeta-file", "output-file", "bam-file-pattern")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }

    ## Split list arguments
    for (i in c("filter-sample-ids", "expected-bam-files")) {
        if (i %in% names(cmdopts)) {
            cmdopts[[i]] %<>% str_split(",") %>% unlist
        }
    }

    if (! "window-spacing" %in% names(cmdopts)) {
        cmdopts[["window-spacing"]] <- cmdopts[["window-width"]]
    }
    ## Convert bp args to numbers
    for (i in c("window-width", "window-spacing", "read-extension")) {
        cmdopts[[i]] %<>% parse_bp
    }

    cmdopts$threads %<>% round
    assert_that(cmdopts$threads >= 1)

    cmdopts$help <- NULL

    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

## Terminate early on argument-processing errors
invisible(get_options(commandArgs(TRUE)))

{
    cmdopts <- get_options(commandArgs(TRUE))
    ## TODO: Eliminate all setwd
    tryCatch(setwd(file.path(dirname(na.omit(get_Rscript_filename())), "..")),
             error = function(...) tsmsg("WARNING: Could not determine script path. Ensure that you are already in the correct directory."))

    tsmsg("Args:")
    print_var_vector(cmdopts)

    if (cmdopts$threads > 1) {
        use_futures("multicore", workers = cmdopts$threads, quiet = TRUE)
    } else {
        use_futures("sequential", quiet = TRUE)
    }
    tsmsg(glue("Using {cmdopts$threads} cores."))

    if (cmdopts$window_width == cmdopts$window_spacing) {
        tsmsg("Using a window size and spacing of ", format_bp(cmdopts$window_width), ".")
    } else {
        tsmsg("Using a window size of ", format_bp(cmdopts$window_width),
              " and a spacing of ", format_bp(cmdopts$window_spacing), ".")
    }
    ## Fragment size is not used for bins
    if (!cmdopts$bin) {
        tsmsg("Assuming a fragment size of ", format_bp(cmdopts$read_extension))
    }

    tsmsg("Loading sample data")

    sample_table <- readRDS(cmdopts$samplemeta_file) %>%
        ## Compute full path to BAM file
        mutate(bam_file = glue(cmdopts$bam_file_pattern, SAMPLE = .[[cmdopts$sample_id_column]])) %>%
        ## Ensure that days_after_activation is a factor and can't be
        ## interpreted as a numeric
        mutate(days_after_activation = days_after_activation %>%
                   factor %>% fct_relabel(~str_c("Day", .))) %>%
        rename(time_point = days_after_activation)

    if (!is.null(cmdopts$filter_sample_ids)) {
        tsmsg("Selecting only ", length(cmdopts$filter_sample_ids), " specified samples.")
        assert_that(all(cmdopts$filter_sample_ids %in% sample.table[[cmdopts$sample_id_column]]))
        sample.table %<>% .[.[[cmdopts$sample_id_column]] %in% cmdopts$filter_sample_ids,]
    }

    assert_that(all(file.exists(sample.table$bam_file)))

    if ("expected_bam_files" %in% names(cmdopts)) {
        tryCatch({
            assert_that(setequal(samplemeta$bam_file, cmdopts$expected_bam_files))
            tsmsg("Sample metadata contains all expected bam files")
        }, error = function(...) {
            unexpected_existing <- setdiff(samplemeta$bam_file, cmdopts$expected_bam_files)
            expected_but_missing <- setdiff(cmdopts$expected_bam_files, samplemeta$bam_file)
            if (length(unexpected_existing) > 0) {
                tsmsg(glue("Got unexpected bam files: {deparse(unexpected_existing)}"))
            }
            if (length(expected_but_missing) > 0) {
                tsmsg(glue("Didn't find expected bam files: {deparse(expected_but_missing)}"))
            }
            stop("Bam file list was not as expected")
        })
    }

    blacklist_regions <- GRanges()
    if (!is.null(cmdopts$blacklist)) {
        tsmsg("Loading blacklist regions")
        blacklist_regions <- read_regions(cmdopts$blacklist)
        assert_that(is(blacklist_regions, "GRanges"))
        ## Blacklist applies to both strands
        strand(blacklist_regions) <- "*"
    }

    ## Standard nuclear chromosomes only. (chrM is excluded because it is
    ## not located in the nucleus and is thus not subject to histone
    ## modification. The unplaced scaffolds are mostly not large enough to
    ## contain even a single typically-sized peak, so little is lost by
    ## excluding them for this analysis.)
    std.chr <- extractSeqlevels("Homo sapiens", "UCSC") %>% setdiff("chrM")
    rparam <- readParam(discard = blacklist)

    tsmsg(glue("Counting reads in {width} {type} in {scount} samples.",
        width = format_bp(cmdopts$window_width),
        type = ifelse(cmdopts$bin, "bins", "windows"),
        scount = nrow(sample.table)))
    if (cmdopts$threads > 1) {
        wCountsFun <- windowCountsParallel
    } else {
        wCountsFun <- windowCounts
    }
    wcounts <- wCountsFun(
        sample.table$bam_file, spacing=cmdopts$window_spacing,
        width=cmdopts$window_width, ext=cmdopts$read_extension,
        param=rparam, bin=cmdopts$bin)

    ## Add sample metadata to colData in front of mapping stats
    colData(wcounts)$bam.files <- NULL
    colData(wcounts) %<>% cbind(sample_table, .)
    colnames(wcounts) <- sample_table[[cmdopts$sample_id_column]]

    ## Save command and options in the metadata
    metadata(sexp)$cmd_name <- na.omit(c(get_Rscript_filename(), "chipseq-count-windows.R"))[1]
    metadata(sexp)$cmd_opts <- cmdopts

    tsmsg("Saving output file")
    saveRDS(wcounts, cmdopts$output_file)
    tsmsg("Finished.")
}

R dplyr stringr optparse magrittr GenomicRanges GLUE forcats SummarizedExperiment assertthat future getopt rctutils csaw From line 3 of scripts/chipseq-count-windows.R

library(magrittr)
library(dplyr)
library(assertthat)
library(ggplot2)
library(ggforce)
library(here)
library(forcats)

ccf <- readRDS(here("saved_data", "chipseq-ccf.RDS"))
ccf.nbl <- readRDS(here("saved_data", "chipseq-ccf-noBL.RDS"))

sample.table <- readRDS(here("saved_data", "samplemeta-ChIPSeq.RDS")) %>%
    ## Ensure that time_point is a factor and can't be
    ## interpreted as a numeric
    mutate(time_point = days_after_activation %>%
               factor %>% fct_relabel(~glue("Day{.}")),
           days_after_activation = NULL)
    rename(Sample = SampleName,
           ChIP = chip_antibody,
           TimePoint = time_point,
           CellType = cell_type,
           Donor = donor_id) %>%
    mutate(TreatmentGroup = interaction(CellType, TimePoint, sep = "."))

ccftable <- lapply(names(ccf), function(i) {
    ccfValues <- ccf[[i]]
    ccf.nblValues <- ccf.nbl[[i]]
    assert_that(length(ccfValues) == length(ccf.nblValues))
    data_frame(
        Sample = i, Delay = seq(from = 0, length.out = length(ccfValues)),
        CCF = ccfValues,
        RelCCF = CCF/max(CCF),
        CCF.noBL = ccf.nblValues,
        RelCCF.noBL = CCF.noBL/max(CCF.noBL))
}) %>% do.call(what = rbind) %>% inner_join(sample.table, ., by = "Sample")

refline.table <- data.frame(
    Reference = c("Read Length (100bp)", "Nucleosome Footprint (147bp)"),
    Xintercept = c(100, 147))

{
    baseplot <- ggplot(ccftable) +
        facet_wrap(~ChIP, scales = "free") +
        aes(x = Delay, y = CCF, group = Sample, color = TreatmentGroup, linetype = NA) +
        ylim(0,NA) +
        geom_vline(data = refline.table,
                   aes(xintercept = Xintercept, linetype = Reference, color = NA),
                   color = "black", alpha = 0.5) +
        ## geom_rug(data = ccfmaxtable, sides = "b") +
        scale_color_hue(name = "Group") +
        scale_linetype(name = "Reference") +
        theme(legend.position = "bottom")
    p <- list(
        Raw = baseplot +
            geom_line(size = 0.25, linetype = "solid") +
            ggtitle("Cross-Correlation Function, Raw"),
        loess_span0.05 = baseplot +
            geom_smooth(fill = NA, method = "loess", span = 0.05, n = 500, size = 0.25, linetype = "solid") +
            ggtitle("Cross-Correlation Function, Loess-Smoothed (span = 0.05)"),
        loess_span0.075 = baseplot +
            geom_smooth(fill = NA, method = "loess", span = 0.075, n = 500, size = 0.25, linetype = "solid") +
            ggtitle("Cross-Correlation Function, Loess-Smoothed (span = 0.075)"),
        loess_span0.1 = baseplot +
            geom_smooth(fill = NA, method = "loess", span = 0.1, n = 500, size = 0.25, linetype = "solid") +
            ggtitle("Cross-Correlation Function, Loess-Smoothed (span = 0.1)"))
    pdf(here("plots", "csaw", "CCF-plots.pdf"), width = 12, height = 8)
    print(p)
    dev.off()
    pdf(here("plots", "csaw", "CCF-plots-relative.pdf"), width = 12, height = 8)
    print(lapply(p, . %>% add(aes(y = RelCCF))))
    dev.off()
    pdf(here("plots", "csaw", "CCF-plots-noBL.pdf"), width = 12, height = 8)
    print(lapply(p, . %>% add(aes(y = CCF.noBL))))
    dev.off()
    pdf(here("plots", "csaw", "CCF-plots-relative-noBL.pdf"), width = 12, height = 8)
    print(lapply(p, . %>% add(aes(y = RelCCF.noBL))))
    dev.off()
}


ccfmaxtable <- ccftable %>%
    filter(Delay >= 50) %>%
    group_by(Sample, CellType, TimePoint, Donor, ChIP, TreatmentGroup) %>%
    summarize(Delay = Delay[which.max(CCF)],
              CCF = max(CCF),
              Delay.noBL = .$Delay[which.max(CCF.noBL)],
              CCF.noBL = max(CCF.noBL)) %>%
    ungroup

ccfmaxtable.noBL <- ccftable %>%
    filter(Delay >= 50) %>%
    group_by(Sample) %>%
    do(.[which.max(.$CCF.noBL),]) %>%
    ungroup

lims <- range(c(ccfmaxtable$Delay, ccfmaxtable$Delay.noBL))

p <- ggplot(ccfmaxtable) +
    facet_wrap(~ChIP) +
    coord_fixed(xlim = lims, ylim = lims) +
    ## Reference guide lines with circles at intersection
    geom_hline(data = refline.table,
               aes(yintercept = Xintercept, linetype = Reference),
               color = "grey40") +
    geom_vline(data = refline.table,
               aes(xintercept = Xintercept, linetype = Reference),
               color = "grey40") +
    geom_circle(data = refline.table,
                aes(x0 = Xintercept, y0 = Xintercept, linetype = Reference, r = 5),
                color = "grey40", show_guide = FALSE) +
    ## Plot the actual data
    geom_point(aes(x = Delay, y = Delay.noBL, color = TreatmentGroup)) +
    scale_shape_manual(values = c(`Read Length (100bp)` = 1, `Nucleosome Footprint (147bp)` = 1)) +
    guides(colour = guide_legend(override.aes = list(size = 2))) +
    theme(legend.position = "bottom") +
    xlab("Delay of Maximum Cross-Correlation (With Blacklist)") +
    ylab("Delay of Maximum Cross-Correlation (No Blacklist)") +
    ggtitle("Delay of Maximum Cross-Correlation With and Without Blacklist")

pdf(here("plots", "csaw", "CCF-max-plot.pdf"), width = 10, height = 10)
print(p)
dev.off()

R ggplot2 dplyr magrittr here forcats ggforce assertthat From line 3 of scripts/chipseq-plot-ccf.R

wasabi::prepare_fish_for_sleuth(fish_dirs = commandArgs(TRUE), force = TRUE)

R From line 3 of scripts/convert-salmon-to-hdf5.R

suppressPackageStartupMessages({
    library(getopt)
    library(optparse)
    library(stringr)
    library(glue)
    library(magrittr)
    library(assertthat)
    library(GenomicRanges)
    library(rctutils)
})

get_options <- function(opts) {

    ## Do argument parsing early so the script exits quickly if arguments are invalid
    optlist <- list(
        ## So far this script only supports TxDb objects because
        ## figuring out the first exon and TSS from other
        ## less-structured formats is a pain.
        make_option(c("-t", "--annotation-txdb"), metavar = "TXDBNAME", type = "character",
                    help = "Name of TxDb package, or the name of a database file, to use for gene annotation"),
        make_option(c("-r", "--promoter-radius"), metavar = "RADIUS", type = "character",
                    help = "Maximum distance from a gene's transcription start site that is considered part of the promoter."),
        make_option(c("-o", "--output-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "Output file name. The GRanges object containing the promoter regions will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-a", "--additional-gene-info"), metavar = "FILENAME", type = "character",
                    help = "RDS/RData/xlsx/csv file containing a table of gene metadata. Row names (or the first column of the file if there are no row names) should be gene/feature IDs that match the ones used in the main annotation, and these should be unique. This can also be a GFF3 file where the metadata is in the attributes of elements of type 'gene', where the 'ID' attribute specifies the gene ID."))
    progname <- na.omit(c(get_Rscript_filename(), "generate-promoter.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -t TXDB -r RADIUS -o OUTPUT.RDS",
        description = "Generate promoter regions for a gene annotation.

From each transcription start site, a region is extended to the specified radius in both directions to define the promoter region for that TSS. Then, any overlapping promoters that share a gene ID are merged. Note that the number of non-overlapping promoters necessarily depends on the chosen promoter radius, and different radii will give different numbers of promoters. The resulting GRanges object will have several annotation columns added: 'PromoterID', which is an arbitrary unique key for each promoter; 'GeneID', which may be NA, may be equal to the TxID for transcripts with no Gene ID, and may be non-unique if a single gene has multiple non-overlapping promoters; TxID, a CharacterList of all transcript IDs whose TSS are included in the promoter.",
option_list = optlist,
add_help_option = TRUE,
prog = progname,
epilogue = "Note that all base pair sizes (for the promoter radius) may have a suffix of 'bp', 'kbp', 'mbp', or 'tbp' (and the 'p' is optional). For example, 10kbp = 10000")

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("annotation-txdb", "output-file", "promoter-radius")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }

    ## Convert bp args to numbers
    for (i in c("promoter-radius")) {
        cmdopts[[i]] %<>% parse_bp
    }

    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

## Do this early to handle "--help" before wasting time loading
## pacakges & stuff
invisible(get_options(commandArgs(TRUE)))

{
    cmdopts <- get_options(commandArgs(TRUE))
    cmdopts$help <- NULL

    tsmsg("Args:")
    print_var_vector(cmdopts)

    ## Delete the output file if it exists
    suppressWarnings(file.remove(cmdopts$output_file))
    assert_that(!file.exists(cmdopts$output_file))

    ## Only chr1-chr22,chrX,chrY
    std.chr <- extractSeqlevels("Homo sapiens", "UCSC") %>% setdiff("chrM")

    tsmsg("Reading annotation data")
    txdb <- get_txdb(cmdopts$annotation_txdb)
    tsmsg(glue("Getting {format_bp(cmdopts$promoter_radius)}-radius promoters"))
    all_promoters <- suppressWarnings(promoters(txdb, upstream = cmdopts$promoter_radius, downstream = cmdopts$promoter_radius)) %>%
        trim %>% keepSeqlevels(std.chr, pruning.mode = "coarse")

    tsmsg("Annotating promoters")
    mcols(all_promoters) %<>%
        ## Not using dplyr because it's a BioC DataFrame
        transform(TxID = tx_name,
                  tx_name = NULL,
                  tx_id = NULL)
    all_promoters$GeneID <- suppressMessages(mapIds(txdb, all_promoters$TxID, keytype = "TXNAME", column = "GENEID", multiVals = "first")) %>%
        ifelse(is.na(.), all_promoters$TxID, .)
    tsmsg("Splitting promoters by gene ID")
    gene_promoters <- split(all_promoters, all_promoters$GeneID)
    assert_that(all(lengths(gene_promoters) >= 1))
    tsmsg("Merging overlapping promoters from the same gene")
    merged_promoters <- bplapply(gene_promoters, function(gp) {
        gp_reduced <- reduce(gp)
        gp_reduced$GeneID <- gp$GeneID[1]
        names(gp_reduced) <- gp_reduced$PromoterID <-
            glue("{gene}-P{pnum}", gene = gp_reduced$GeneID, pnum = seq_along(gp_reduced))
        pgroup <- gp_reduced$PromoterID[nearest(gp, gp_reduced)]
        gp_reduced$TxID <- CharacterList(split(gp$TxID, pgroup))[gp_reduced$PromoterID]
        gp_reduced
    }) %>% unname %>% GRangesList %>% unlist

    if ("additional_gene_info" %in% names(cmdopts)) {
        tsmsg("Reading additional gene annotation metadata")
        additional_gene_info <- read_additional_gene_info(cmdopts$additional_gene_info)
        genes_without_info <- setdiff(names(gene_promoters), rownames(additional_gene_info))
        if (length(genes_without_info) > 0) {
            empty_row <- list(character(0)) %>% rep(ncol(additional_gene_info)) %>% setNames(colnames(additional_gene_info))
            single.val.cols <- sapply(additional_gene_info, function(x) all(lengths(x) == 1))
            for (i in seq_along(empty_row)) {
                if (single.val.cols[i]) {
                    empty_row[[i]] <- NA
                } else {
                    empty_row[[i]] <- list(logical(0)) %>% as(class(additional_gene_info[[i]]))
                }
            }
            empty_row %<>% DataFrame
            empty_gene_table <- empty_row[rep(1, length(genes_without_info)),] %>%
                set_rownames(genes_without_info)
            additional_gene_info %<>% rbind(empty_gene_table)
        }
        assert_that(all(merged_promoters$GeneID %in% rownames(additional_gene_info)))
        mcols(merged_promoters)[colnames(additional_gene_info)] <- additional_gene_info[merged_promoters$GeneID,]
        metadata(merged_promoters) %<>% c(metadata(additional_gene_info))
    }

    tsmsg("Saving output file")
    save_RDS_or_RDA(merged_promoters, cmdopts$output_file)
    invisible(NULL)
}

R stringr optparse magrittr GenomicRanges GLUE assertthat getopt rctutils From line 3 of scripts/generate-promoters.R

suppressMessages({
    library(rtracklayer)
    library(assertthat)
    library(BSgenome.Hsapiens.UCSC.hg38)
})

{
    outfile <- snakemake@output[[1]]
    assert_that(is.character(outfile))

    mySession <- browserSession()
    genome(mySession) <- "hg38"
    tab <- getTable(ucscTableQuery(mySession, "cpgIslandExtUnmasked"))
    gr <- makeGRangesFromDataFrame(tab, start.field = "chromStart", end.field = "chromEnd",
                                   starts.in.df.are.0based = TRUE, keep.extra.columns = TRUE,
                                   seqinfo = seqinfo(BSgenome.Hsapiens.UCSC.hg38))
    ## GRanges already knows the length of each feature, so this field is
    ## redundant.
    assert_that(all(width(gr) == gr$length))
    mcols(gr)$length <- NULL
    seqinfo(gr) <- seqinfo(BSgenome.Hsapiens.UCSC.hg38)

    saveRDS(gr, outfile)
}

R BSgenome.Hsapiens.UCSC.hg38 assertthat From line 1 of scripts/get-CpG.R

suppressPackageStartupMessages(suppressMessages({
    library(stringr)
    library(glue)
    library(SRAdb)
    library(assertthat)
    library(rctutils)

    sra_con <- {
        sqlfile <- here("saved_data", "SRAmetadb.sqlite")
        if(!file.exists(sqlfile)) {
            getSRAdbFile(destdir = dirname(sqlfile),
                         destfile = str_c(basename(sqlfile), ".gz"))
        }
        stopifnot(file.exists(sqlfile))
        dbConnect(SQLite(),sqlfile)
    }

    ## Check if we can use ascp download method, otherwise fall back
    ## to FTP
    getSRAfile <- function(...) {
        SRAdb::getSRAfile(..., srcType = "ftp")
    }

    ascp.path <- first_accessible_path(
        c(Sys.which("ascp"),
          path.expand("~/.aspera/connect/bin/ascp")), mode = 1)
    if (!is.na(ascp.path)) {
        ascp.key.file <- normalizePath(first.accessible(
            file.path(dirname(ascp.path),
                      c("asperaweb_id_dsa.openssh",
                        "../etc/asperaweb_id_dsa.openssh"))))
        if (!is.na(ascp.key.file)) {
            cmd <- glue(
                "{ascp} -T -k1 -i {keyfile}",
                ascp = shQuote(ascp.path),
                keyfile = shQuote(ascp.key.file))
            getSRAfile <- function(...) {
                SRAdb::getSRAfile(..., srcType = "fasp",
                                  ascpCMD = cmd)
            }
        }
    }

}))

{
    sra_dir <- "sra_files"

    sra_runs <- commandArgs(TRUE)
    assert_that(all(str_detect(sra_runs, "^SRR")))

    getSRAfile(sra_runs, sra_con, destDir = sra_dir, makeDirectory = TRUE)

    expected_files <- here(sra_dir, str_c(sra_runs, ".sra"))
    assert_that(all(file.exists(expected_files)))
    invisible(NULL)                     # Avoid output on console
}

R stringr GLUE assertthat rctutils SRAdb From line 3 of scripts/get-sra-run-files.R

suppressMessages({
    library(rtracklayer)
    library(stringr)
    library(assertthat)
    library(dplyr)
    library(BSgenome.Hsapiens.UCSC.hg19)
})

{
    chainfile <- snakemake@input[["chain"]]
    chain <- import.chain(chainfile)

    outfile <- snakemake@output[[1]]
    assert_that(is.character(outfile))

    mySession <- browserSession()
    genome(mySession) <- "hg19"
    sites.table <- getTable(ucscTableQuery(mySession, track = "tfbsConsSites", table = "tfbsConsSites")) %>%
        fac2char
    names.table <- getTable(ucscTableQuery(mySession, track = "tfbsConsSites", table = "tfbsConsFactors")) %>%
        fac2char
    ## Keep only human entries, interpret "N" as NA
    names.table$id[names.table$id == "N"] <- NA
    names.table %<>% filter(species == "human") %>% select(-species) %>% droplevels %>%
        group_by(name) %>% summarize_all(. %>% str_c(collapse = ","))
    assert_that(!anyDuplicated(names.table$name))
    full.table <- sites.table %>% inner_join(names.table, "name")

    gr <- makeGRangesFromDataFrame(full.table, start.field = "chromStart", end.field = "chromEnd",
                                   starts.in.df.are.0based = TRUE, keep.extra.columns = TRUE,
                                   seqinfo = seqinfo(BSgenome.Hsapiens.UCSC.hg19))
    gr.lifted <- liftOverLax(gr, chain, allow.gap = 2) %>% .[lengths(.) == 1] %>% unlist
    save_RDS_or_RDA(gr.lifted, outfile)
}

R dplyr stringr BSgenome.Hsapiens.UCSC.hg19 assertthat From line 1 of scripts/get-tfbs-conserved.R

library(rctutils)

liftOver_motifMap(infile = snakemake@input[["bed"]], chainfile = snakemake@input[["chain"]],
                  outfile = snakemake@output[["bed"]], allow.gap = 2)

R rctutils From line 1 of scripts/liftOver-MotifMap.R

library(getopt)
library(optparse)

get_options <- function(opts) {

    ## Do argument parsing early so the script exits quickly if arguments are invalid
    optlist <- list(
        make_option(c("-i", "--idr-file"), metavar = "FILE", type = "character",
                    help = "(REQUIRED) Output file from idr script containing computed IDR values."),
        make_option(c("-o", "--output-file"), metavar = "PLOTS.PDF", type = "character",
                    help = "(REQUIRED) PDF file in which to create plots"),
        make_option(c("-A", "--sample-A-name"), type = "character", default = "Sample A",
                    help = "Name of the first sample. This will be used to identify the sample in plot titles and axis labels."),
        make_option(c("-B", "--sample-B-name"), type = "character", default = "Sample B",
                    help = "Name of the second sample. This will be used to identify the sample in plot titles and axis labels."),
        make_option(c("-P", "--sample-name-common-prefix"), type = "character",
                    help = "Common prefix to both sample names. Used to reduced redundancy in plot titles. Will be stripped from given sample names if present."),
        make_option(c("-s", "--prefix-separator"), type = "character", default = "._-/:| ",
                    help = "Characters that will be stripped from the split point between prefix and sample name."))
    progname <- na.omit(c(get_Rscript_filename(), "plot-idr.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -i FILE -o PLOTS.PDF [ -A \"Sample A\" -B \"Sample B\" ]",
        description = "Generate QC plots for an IDR analysis of two samples.",
        option_list = optlist,
        add_help_option = TRUE,
        prog = progname,
        epilogue = "")

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("idr-file", "output-file")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }
    cmdopts %>% setNames(str_replace_all(names(.), "-", "_"))
}

## Do this early to handle "--help" before wasting time loading
## pacakges & stuff
get_options(commandArgs(TRUE))

library(magrittr)
library(dplyr)
library(ggplot2)
library(scales)
library(ks)
library(reshape2)
library(stringr)
library(rex)
library(glue)
library(rctutils)

cutIDR <- function(x, thresholds = c(0.01, 0.05, 0.1)) {
    fullbreaks <- c(0, thresholds, 1)
    labels <- c(glue(" <= {thresholds}"), glue(" > {tail(thresholds, 1)}"))
    cut(x, breaks = fullbreaks, labels = labels) %>%
        factor(levels = rev(levels(.)))
}

{
    cmdopts <- get_options(commandArgs(TRUE))
    ## myargs <- c("-i", "idr_analysis/epic_hg38.analysisSet/H3K4me3_condition.ALL_D4659vsD5053/idrValues.txt",
    ##             "-o", "idr_analysis/epic_hg38.analysisSet/H3K4me3_condition.ALL_D4659vsD5053/idrplots.pdf",
    ##             "-A", "H3K4me3_ALL_D4659", "-B", "H3K4me3_ALL_D5053",
    ##             "-P", "H3K4me3_ALL")
    ## cmdopts <- get_options(myargs)
    cmdopts$help <- NULL

    tsmsg("Args:")
    print_var_vector(cmdopts)

    if (!is.null(cmdopts$sample_name_common_prefix)) {
        prefix.rx <- rex(start, cmdopts$sample_name_common_prefix)
        cmdopts$sample_A_name %<>% str_replace(prefix.rx, "")
        cmdopts$sample_B_name %<>% str_replace(prefix.rx, "")

        if (!is.null(cmdopts$prefix_separator) &&
            str_length(cmdopts$prefix_separator) > 0) {
            post.sep.rx <- rex(some_of(cmdopts$prefix_separator), end)
            pre.sep.rx <- rex(start, some_of(cmdopts$prefix_separator))

            cmdopts$sample_name_common_prefix %<>%
                str_replace(post.sep.rx, "")
            cmdopts$sample_A_name %<>% str_replace(pre.sep.rx, "")
            cmdopts$sample_B_name %<>% str_replace(pre.sep.rx, "")
        }
    }

    idrtab <- read_idr_table(cmdopts$idr_file)

    idrtab %<>%
        mutate(GlobalIDR.Cut = cutIDR(GlobalIDR),
               LocalIDR.Cut = cutIDR(LocalIDR),
               rankA = rank(scoreA),
               rankB = rank(scoreB),
               rankBinA = ceiling(rankA / length(rankA) * 20) %>% factor,
               rankBinB = ceiling(rankB / length(rankB) * 20) %>% factor)

    title_samples <- str_interp("${sample_A_name} vs ${sample_B_name}", cmdopts)
    title_sampleA <- str_interp("${sample_A_name}", cmdopts)
    title_sampleB <- str_interp("${sample_B_name}", cmdopts)
    if (!is.null(cmdopts$sample_name_common_prefix)) {
        title_samples <- str_c(cmdopts$sample_name_common_prefix, ", ", title_samples)
        title_sampleA <- str_c(cmdopts$sample_name_common_prefix, " ", title_sampleA)
        title_sampleB <- str_c(cmdopts$sample_name_common_prefix, " ", title_sampleB)
    }

    plotlist <- list(
        RankCons = ggplot(idrtab %>% arrange(desc(GlobalIDR))) +
            aes(x = rankA, y = rankB,
                color = GlobalIDR.Cut) +
            geom_point() +
            scale_color_manual(name = "IDR", values = discrete_gradient(nlevels(idrtab$GlobalIDR.Cut))) +
            coord_fixed() +
            theme(legend.position = "bottom") +
            xlab(str_interp("Peak Rank in ${cmdopts$sample_A_name}")) +
            ylab(str_interp("Peak Rank in ${cmdopts$sample_B_name}")) +
            ggtitle(str_interp("Rank consistency plot for ${title_samples}")),
        ScoreCons = ggplot(idrtab %>% arrange(desc(GlobalIDR))) +
            aes(x = scoreA, y = scoreB,
                color = cutIDR(GlobalIDR)) +
            geom_point() +
            scale_x_log10() + scale_y_log10() +
            scale_color_manual(name = "IDR", values = discrete_gradient(nlevels(idrtab$GlobalIDR.Cut))) +
            coord_fixed() +
            theme(legend.position = "bottom") +
            xlab(str_interp("Peak Score in ${cmdopts$sample_A_name}")) +
            ylab(str_interp("Peak Score in ${cmdopts$sample_B_name}")) +
            ggtitle(str_interp("Score consistency plot for ${title_samples}")))

    neglog10_trans <- neglog_trans(10)

    ## Sample B rank vs IDR
    pointdata <- idrtab %>% transmute(x = rankA, y = -log10(GlobalIDR))
    H <- pointdata %>% Hbcv.diag(binned = TRUE)
    k <- pointdata %>%
        as.matrix %>%
        kde(gridsize = 1024, bgridsize = rep(1024, 2), verbose = TRUE,
            H = H/8, binned = TRUE)
    ## Sometimes the estimate goes a bit negative, which is no good
    densdata <- melt(k$estimate) %>%
        transmute(
            x = k$eval.points[[1]][Var1],
            y = k$eval.points[[2]][Var2],
            Density = value %>% pmax(0),
            ## Part of a hack to make the alpha look less bad
            AlphaDens = value %>% pmax(1e-15))

    plotlist$DensA <-
        ggplot(densdata) +
        aes(x = x, y = neglog10_trans$inverse(y), alpha = Density) +
        geom_raster(fill = muted("blue",c = 90), interpolate = TRUE) +
        scale_alpha(limits = c(0, max(densdata$Density)/3), range = c(0,1), guide = FALSE) +
        theme(legend.position = "bottom") +
        coord_cartesian(expand = FALSE) +
        xlab(str_interp("Peak Rank in ${cmdopts$sample_A_name}")) +
        scale_y_continuous(name = "IDR", trans = neglog10_trans) +
        ggtitle(str_interp("IDR vs Peak Rank for ${title_sampleA}"))

    ## Sample B rank vs IDR
    pointdata <- idrtab %>% transmute(x = rankB, y = -log10(GlobalIDR))
    H <- pointdata %>% Hbcv.diag(binned = TRUE)
    k <- pointdata %>%
        as.matrix %>%
        kde(gridsize = 1024, bgridsize = rep(1024, 2), verbose = TRUE,
            H = H/8, binned = TRUE)
    ## Sometimes the estimate goes a bit negative, which is no good
    densdata <- melt(k$estimate) %>%
        transmute(
            x = k$eval.points[[1]][Var1],
            y = k$eval.points[[2]][Var2],
            Density = value %>% pmax(0),
            ## Part of a hack to make the alpha look less bad
            AlphaDens = value %>% pmax(1e-15))

    plotlist$DensB <-
        ggplot(densdata) +
        aes(x = x, y = neglog10_trans$inverse(y), alpha = Density) +
        geom_raster(fill = muted("blue",c = 90), interpolate = TRUE) +
        scale_alpha(limits = c(0, max(densdata$Density)/3), range = c(0,1), guide = FALSE) +
        theme_bw() +
        coord_cartesian(expand = FALSE) +
        xlab(str_interp("Peak Rank in ${cmdopts$sample_B_name}")) +
        scale_y_continuous(name = "IDR", trans = neglog10_trans) +
        ggtitle(str_interp("IDR vs Peak Rank for ${title_sampleB}"))

    pdf(cmdopts$output_file)
    print(plotlist)
    dev.off()
}

R ggplot2 dplyr stringr optparse magrittr reshape2 GLUE scales getopt rctutils rex ks From line 6 of scripts/plot-idr.R

library(graphite)
library(here)
library(glue)
library(BiocParallel)
library(rctutils)

use_futures("multicore")

target.species <- "hsapiens"
dbnames <- pathwayDatabases() %>%
    filter(species == target.species) %$% database %>%
    as.character

graphite.species <- "hsapiens"
dbs.raw <- dbnames %>% set_names %>%
    lapply(pathways, species = graphite.species)
idtypes <- c(entrez = "entrez", ensembl = "ENSEMBL", symbol = "symbol")
dbs <- suppressMessages(lapply(idtypes, . %>% bplapply(dbs.raw, convertIdentifiers, to = .)))

for (i in names(dbs)) {
    fname <- here("saved_data", glue("graphite-{i}.RDS"))
    saveRDS(dbs[[i]], fname)
}

spia.outdir <- here("saved_data", "SPIA")
dir.create(spia.outdir, recursive = TRUE, showWarnings = FALSE)

## Prepare SPIA databases
for (idtype in names(dbs)) {
    db <- dbs[[idtype]]
    bpmapply(prepareSPIA,
             db = db,
             pathwaySetName = file.path(spia.outdir, glue("graphite-{idtype}-{names(db)}Ex")))
}

R GLUE here graphite rctutils From line 2 of scripts/prepare-graphite.R

library(GSEABase)
library(BiocParallel)
library(glue)
library(assertthat)
library(rctutils)

use_futures("multicore")

split.by.category.and.subcategory <- function(x, cat, subcat) {
    cat <- droplevels(as.factor(cat))
    subcat <- droplevels(as.factor(subcat))
    subcat.by.cat <- lapply(split(subcat, cat), droplevels)
    res <- split(x, cat)
    for (ci in levels(cat)) {
        cat.subcat <- subcat.by.cat[[ci]]
        for (sci in levels(cat.subcat)) {
            scname <- str_c(ci, ".", sci)
            ## This regexp stuff makes sure that "CP:KEGG" etc. are
            ## included in the "CP" subcategory
            scregexp <- regex(str_c("^", quotemeta(sci), "(\\:|$)"))
            res[[scname]] <- res[[ci]][!is.na(cat.subcat) & str_detect(cat.subcat, scregexp)]
        }
    }
    SimpleList(res)
}

extract.bset.metadata <- function(x) {
    ## Extract relevant metadata about each gene set and save it in a data frame
    bset.urls <- CharacterList(lapply(x, function(a) grep("^file:", urls(a), perl = TRUE, value = TRUE, invert = TRUE)))
    bset.urls <- rtracklayer:::pasteCollapse(bset.urls)
    bset.urls[bset.urls == ""] <- NA
    assert_that(all(lengths(bset.urls) == 1))
    bsets.meta <- data.frame(row.names = names(x), setName = names(x),
                             category = categories, subCategory = subcategories,
                             setIdentifier = unlist(lapply(x, setIdentifier)),
                             contributor = unlist(lapply(x, contributor)),
                             description = unlist(lapply(x, description)),
                             url = bset.urls)
    bsets.meta
}

## This just loads all the msigdb gene sets into R and saves them as an
## RDS file. Or it reads that RDS file if it already exists.
{
    ## Load MSigDB
    if (file.exists("saved_data/msigdb_v6.1.RDS")) {
        bsets_symbol <- readRDS("saved_data/msigdb_v6.1.RDS")
    } else {
        bsets_symbol <- getBroadSets("saved_data/msigdb_v6.1.xml")
        saveRDS(bsets_symbol, "saved_data/msigdb_v6.1.RDS")
    }
    id_converters <- list(symbol = identity,
                          entrez = . %>% mapIdentifiers(EntrezIdentifier("org.Hs.eg")),
                          ensembl = . %>% mapIdentifiers(ENSEMBLIdentifier("org.Hs.eg")))
    bsets <- bplapply(id_converters, function(converter) converter(bsets_symbol))
    sids <- unlist(lapply(as.list(bsets_symbol), setIdentifier))
    # Need to tell the converted gene sets that their gene IDs have
    # been converted
    for (idtype in setdiff(names(bsets), "symbol")) {
        message("Fixing idtype ", idtype)
        bsetlist <- as.list(bsets[[idtype]])
        bsetlist <- bplapply(seq_along(bsetlist), function(i) {
            x <- bsetlist[[i]]
            setIdentifier(x) <- sids[i]
            x
        })
        bsets[[idtype]] <- GeneSetCollection(bsetlist)
    }
    ## Split into categories
    ctypes <- lapply(bsets_symbol, collectionType)
    categories <- factor(unlist(lapply(ctypes, bcCategory)))
    subcategories <- factor(unlist(lapply(ctypes, bcSubCategory)))
    all.collections <- lapply(bsets, split.by.category.and.subcategory,
                             cat = categories, subcat = subcategories)
    for (i in names(all.collections)) {
        fname <- glue("saved_data/msigdb-{i}.RDS")
        saveRDS(all.collections[[i]], fname)
    }

    bsets.meta <- extract.bset.metadata(bsets_symbol)
    ## Munge some overly verbose descriptions (Note: These may no
    ## longer be applicable, but running them anyway won't hurt.)
    bsets.meta$description %<>%
        str_replace(
            pattern = "Genes with promoter regions \\[-2kb,2kb\\] around transcription start site containing motif (\\w+)\\. Motif does not match any known transcription factor",
            replacement = "Motif \\1 (no known TF) in gene promoter (2kb radius)") %>%
        str_replace(
            pattern = "Genes with promoter regions \\[-2kb,2kb\\] around transcription start site containing the motif (\\w+) which matches annotation for (.+)",
            replacement = "Motif \\1 (matches \\2) in gene promoter (2kb radius)") %>%
        str_replace(
            pattern = "^Genes involved in ",
            replacement = "") %>%
        str_replace(
            pattern = "^Genes annotated by the GO term ",
            replacement = "")
    saveRDS(bsets.meta, "saved_data/msigdb-meta.RDS")
}

R GLUE assertthat GSEABase rctutils From line 1 of scripts/prepare-msigdb.R

library(getopt)
library(optparse)
library(rctutils)

## Don't default to more than 4 cores
num.cores <- min(availableCores(), 4)

get_options <- function(opts) {

    ## Do argument parsing early so the script exits quickly if arguments are invalid
    optlist <- list(
        make_option(c("-s", "--samplemeta-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) RDS/RData/xlsx/csv file containing a table of sample metadata. Any existing rownames will be replaced with the values in the sample ID  column (see below)."),
        make_option(c("-c", "--sample-id-column"), type = "character", default = "Sample",
                    help = "Sample metadata column name that holds the sample IDs. These will be substituted into '--bam-file-pattern' to determine the BAM file names."),
        make_option(c("-f", "--filter-sample-ids"), type = "character",
                    help = "Comma-separated list of sample IDs. If this options is provided, only the specified sample IDs will be used."),
        make_option(c("-p", "--bam-file-pattern"), metavar = "PATTERN", type = "character",
                    help = "(REQUIRED) Format string to convert sample IDs into BAM file paths. This should contain the string '{SAMPLE}' wherever the sample ID should be substituted (this can occur multiple times),. Example: 'bam_files/Sample_{SAMPLE}/Aligned.bam"),
        make_option(c("-o", "--output-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) Output file name. The SummarizedExperiment object containing the counts will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-b", "--expected-bam-files"), metavar = "BAMFILE1,BAMFILE2,...", type = "character",
                    help = "Comma-separated list of bam file names expected to be used as input. This argument is optional, but if it is provided, it will be checked against the list of files determined from '--samplemeta-file' and '--bam-file-pattern', and an error will be raised if they don't match exactly."),
        make_option(c("-j", "--threads"), metavar = "N", type = "integer", default = num.cores,
                    help = "Number of threads to use while counting reads"),
        make_option(c("-t", "--annotation-txdb"), metavar = "PACKAGENAME", type = "character",
                    help = "Name of TxDb package, or the name of a database file, to use for gene annotation"),
        make_option(c("-g", "--annotation-gff"), metavar = "FILENAME", type = "character",
                    help = "File Name of GFF3 file to use for gene annotation."),
        make_option(c("-f", "--gff-exon-featuretype"), metavar = "FEATURETYPE", type = "character", default = "exon",
                    help = "GFF feature type to use"),
        make_option(c("-i", "--gff-geneid-attr"), metavar = "ATTRNAME", type = "character", default = "gene_id",
                    help = "GFF feature attribute to use as a feature's Gene ID."),
        make_option(c("-e", "--gff-gene-featuretype"), metavar = "FEATURETYPE", type = "character", default = "gene",
                    help = "GFF feature type from which gene metadata should be extracted."),
        make_option(c("-r", "--annotation-rds"), metavar = "FILENAME", type = "character",
                    help = "File Name of RDS or RData file to use for gene annotation. It should contain a single GRanges or GRangesList object (or something that can be coereced into one), with each element representing a gene/feature to be counted. Any metadata columns on the object will be carried through to the output SummarizedExperiment."),
        make_option(c("--annotation-saf"), metavar = "FILENAME", type = "character",
                    help = "File Name of RDS/RData/xlsx/csv file containing gene annotations in SAF format (i.e. 5 columns named GeneID, Chr, Start, End, Strand). Additional columns beyond the first 5 will be retained as metadata columns on the genes/exons."),
        make_option(c("-a", "--additional-gene-info"), metavar = "FILENAME", type = "character",
                    help = "RDS/RData/xlsx/csv file containing a table of gene metadata. Row names (or the first column of the file if there are no row names) should be gene/feature IDs that match the ones used in the main annotation, and these should be unique. This can also be a GFF3 file where the metadata is in the attributes of elements of type specified by '--gff-gene-featuretype' ('gene' by default), where the 'ID' attribute specifies the gene ID."))
    progname <- na.omit(c(get_Rscript_filename(), "rnaseq-count.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -s SAMPLEMETA.RDS -p PATTERN -o SUMEXP.RDS [ -t TXDB.PACKAGE.NAME | -g ANNOTATION.GFF3 | -r ANNOTATION.RDS ]",
        description = "Count reads in genes using Rsubread::featureCounts.

Counts are performed for stranded, non-stranded, and reverse-stranded modes, and are stored along with the sample and gene metadata in a SummarizedExperiment object. Note that the '-s', '-p', and '-o' arguments are all required, since they specify the input and output files. Also, exactly one of '-t', '-g', or '-r' is required to specify the annotation.",
option_list = optlist,
add_help_option = TRUE,
prog = progname,
epilogue = "")

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("samplemeta-file", "bam-file-pattern", "output-file")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }

    ## Split list arguments
    for (i in c("filter-sample-ids", "expected-bam-files")) {
        if (i %in% names(cmdopts)) {
            cmdopts[[i]] %<>% str_split(",") %>% unlist
        }
    }

    ## Ensure that exactly one annotation was provided
    annot.opts <- c("annotation-txdb", "annotation-gff", "annotation-rds", "annotation-saf")
    provided.annot.opts <- intersect(annot.opts, names(cmdopts))
    if (length(provided.annot.opts) < 1) {
        stop("No annotation provided")
    } else if (length(provided.annot.opts) > 1) {
        stop("Multiple annotations provided. Please provide only one annotation.")
    }

    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

## Do this early to handle "--help" before wasting time loading
## pacakges & stuff
invisible(get_options(commandArgs(TRUE)))

library(assertthat)
library(magrittr)
library(stringr)

library(SummarizedExperiment)

## Guess type of ID (currenly unused)
identify.ids <- function(ids, db = "org.Hs.eg.db", idtypes = c("ENTREZID", "ENSEMBL", "UNIGENE"), threshold = 0.5) {
    if (is.character(db)) {
        library(db, character.only = TRUE)
        pos <- str_c("package:", db)
        db <- get(db, pos)
    }
    assert_that(is(db, "AnnotationDb"))
    idtypes %<>% intersect(keytypes(db))
    assert_that(length(idtypes) > 0)
    idcounts <- sapply(idtypes, function(idtype) {
        sum(ids %in% keys(db, idtype))
    }) %>% setNames(idtypes)
    idcounts %<>% sort(decreasing = TRUE)
    result <- names(idcounts)[1]
    if (idcounts[result] / length(ids) < threshold) {
        stop(glue("Could not identify more than {format(threshold * 100, digits = 2)}%% of given IDs as any of {deparse(idtypes)}"))
    }
    tsmsg("Detected gene IDs as ", result)
    attr(result, "counts") <- idcounts
    return(result)
}

{
    cmdopts <- get_options(commandArgs(TRUE))
    ## myargs <- c("-s", "./saved_data/samplemeta-RNASeq.RDS", "-c", "SRA_run", "-p", "aligned/rnaseq_star_hg38.analysisSet_gencode.v25/%s/Aligned.sortedByCoord.out.bam", "-o", "sexp.rds", "-j", "2", "-g", "~/references/hg38/gencode.v25.gff3")
    ## cmdopts <- get_options(myargs)
    cmdopts$help <- NULL

    cmdopts$threads %<>% round %>% max(1)
    if (cmdopts$threads > 1) {
        use_futures("multicore", workers = cmdopts$threads, quiet = TRUE)
    } else {
        use_futures("sequential", quiet = TRUE)
    }
    tsmsg(glue("Using {cmdopts$threads} cores."))

    tsmsg("Args:")
    print_var_vector(cmdopts)

    ## Delete the output file if it exists
    suppressWarnings(file.remove(cmdopts$output_file))
    assert_that(!file.exists(cmdopts$output_file))

    tsmsg("Loading sample metadata")
    samplemeta <- read_table_general(cmdopts$samplemeta_file)

    tsmsg("Got metadata for ", nrow(samplemeta), " samples")

    assert_that(cmdopts$sample_id_column %in% colnames(samplemeta))
    assert_that(!anyDuplicated(samplemeta[[cmdopts$sample_id_column]]))

    rownames(samplemeta) <- samplemeta[[cmdopts$sample_id_column]]

    samplemeta$bam_file <- glue(cmdopts$bam_file_pattern, SAMPLE = samplemeta[[cmdopts$sample_id_column]], .envir = emptyenv())

    if (!is.null(cmdopts$filter_sample_ids)) {
        tsmsg("Selecting only ", length(cmdopts$filter_sample_ids), " specified samples.")
        assert_that(all(cmdopts$filter_sample_ids %in% samplemeta[[cmdopts$sample_id_column]]))
        samplemeta %<>% .[.[[cmdopts$sample_id_column]] %in% cmdopts$filter_sample_ids,]
    }

    if ("expected_bam_files" %in% names(cmdopts)) {
        tryCatch({
            assert_that(setequal(samplemeta$bam_file, cmdopts$expected_bam_files))
            tsmsg("Sample metadata contains all expected bam files")
        }, error = function(...) {
            unexpected_existing <- setdiff(samplemeta$bam_file, cmdopts$expected_bam_files)
            expected_but_missing <- setdiff(cmdopts$expected_bam_files, samplemeta$bam_file)
            if (length(unexpected_existing) > 0) {
                tsmsg(glue("Got unexpected bam files: {deparse(unexpected_existing)}"))
            }
            if (length(expected_but_missing) > 0) {
                tsmsg(glue("Didn't find expected bam files: {deparse(expected_but_missing)}"))
            }
            stop("Bam file list was not as expected")
        })
    }

    assert_that(all(file.exists(samplemeta$bam_file)))

    tsmsg("Reading annotation data")

    if ("annotation_txdb" %in% names(cmdopts)) {
        txdb <- get_txdb(cmdopts$annotation_txdb)
        annot <- exonsBy(txdb, "gene")
    } else if ("annotation_gff" %in% names(cmdopts)) {
        annot <- cmdopts %$%
            read_annotation_from_gff(
                annotation_gff,
                exonFeatureType = gff_exon_featuretype,
                geneIdAttr = gff_geneid_attr,
                geneFeatureType = gff_gene_featuretype)
    } else if ("annotation_rds" %in% names(cmdopts)) {
        annot <- read_annotation_from_rdata(cmdopts$annotation_rds)
    } else if ("annotation_saf" %in% names(cmdopts)) {
        annot <- read_annotation_from_saf(cmdopts$annotation_saf)
    }

    assert_that(is(annot, "GRangesList"))
    tsmsg("Annotation has ", length(annot), " features")

    if ("additional_gene_info" %in% names(cmdopts)) {
        tsmsg("Reading additional gene annotation metadata")
        additional_gene_info <- read_additional_gene_info(cmdopts$additional_gene_info)
        genes_without_info <- setdiff(names(annot), rownames(additional_gene_info))
        if (length(genes_without_info) > 0) {
            empty_row <- list(character(0)) %>% rep(ncol(additional_gene_info)) %>% setNames(colnames(additional_gene_info))
            single.val.cols <- sapply(additional_gene_info, function(x) all(lengths(x) == 1))
            for (i in seq_along(empty_row)) {
                if (single.val.cols[i]) {
                    empty_row[[i]] <- NA
                } else {
                    empty_row[[i]] <- list(logical(0)) %>% as(class(additional_gene_info[[i]]))
                }
            }
            empty_row %<>% DataFrame
            empty_gene_table <- empty_row[rep(1, length(genes_without_info)),] %>%
                set_rownames(genes_without_info)
            additional_gene_info %<>% rbind(empty_gene_table)
        }
        assert_that(all(names(annot) %in% rownames(additional_gene_info)))
        mcols(annot)[colnames(additional_gene_info)] <- additional_gene_info[names(annot),]
        metadata(annot) %<>% c(metadata(additional_gene_info))
    }

    saf <- grl_to_saf(annot)

    if (all(lengths(annot) == 1)) {
        annot.mcols <- mcols(annot)
        annot <- unlist(annot)
        mcols(annot) <- annot.mcols
        rm(annot.mcols)
    }

    empty.counts <- matrix(NA, nrow = length(annot), ncol = nrow(samplemeta))
    sexp <- SummarizedExperiment(
        assays = List(
            counts = empty.counts,
            sense.counts = empty.counts,
            antisense.counts = empty.counts
        ),
        colData = as(samplemeta, "DataFrame"),
        rowRanges = annot,
        metadata = list()
    )
    colnames(sexp) <- colData(sexp)[[cmdopts$sample_id_column]]
    rownames(sexp) <- names(annot)

    tsmsg("Computing sense counts")
    sense.fc <- featureCountsParallel(
        samplemeta$bam_file, annot.ext = saf,
        strandSpecific = 1)
    tsmsg("Computing antisense counts")
    antisense.fc <- featureCountsParallel(
        samplemeta$bam_file, annot.ext = saf,
        strandSpecific = 2)
    tsmsg("Computing unstranded counts")
    unstranded.fc <- featureCountsParallel(
        samplemeta$bam_file, annot.ext = saf,
        strandSpecific = 0)
    assay(sexp, "counts")[,] <- unstranded.fc$counts
    assay(sexp, "sense.counts")[,] <- sense.fc$counts
    assay(sexp, "antisense.counts")[,] <- antisense.fc$counts

    count.stats <- List(counts = unstranded.fc$stat,
                        sense.counts = sense.fc$stat,
                        antisense.counts = antisense.fc$stat) %>%
        endoapply(function(x) {
            x <- data.frame(set_colnames(t(x[,-1]), x[[1]]))
            rownames(x) <- colnames(sexp)
            x
        })
    metadata(sexp)$stat <- count.stats

    tsmsg("Saving SummarizedExperiment")
    save_RDS_or_RDA(sexp, cmdopts$output_file)
    invisible(NULL)
}

R stringr optparse magrittr SummarizedExperiment assertthat getopt rctutils From line 3 of scripts/rnaseq-count.R

library(getopt)
library(optparse)
library(assertthat)
library(rex)
library(rctutils)

get_options <- function(opts) {

    ## Do argument parsing early so the script exits quickly if
    ## arguments are invalid
    optlist <- list(
        make_option(c("-q", "--transcript-quant"), metavar = "SEXP.RDS", type = "character",
                    help = "File name of an R data file containing a RangedSummarizedExperiment of transcript abundances. These will be used to select the highest-expressed TSS for each gene."),
        ## So far this script only supports TxDb objects because
        ## figuring out the first exon and TSS from other
        ## less-structured formats is a pain.
        make_option(c("-t", "--annotation-txdb"), metavar = "TXDBNAME", type = "character",
                    help = "Name of TxDb package, or the name of a database file, to use for gene annotation"),
        make_option(c("-o", "--output-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "Output file name. The GRanges object containing the promoter regions will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-a", "--additional-gene-info"), metavar = "FILENAME", type = "character",
                    help = "RDS/RData/xlsx/csv file containing a table of gene metadata. Row names (or the first column of the file if there are no row names) should be gene/feature IDs that match the ones used in the main annotation, and these should be unique. This can also be a GFF3 file where the metadata is in the attributes of elements of type 'gene', where the 'ID' attribute specifies the gene ID."))
    progname <- na.omit(c(get_Rscript_filename(), "rnaseq-count.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog [options] -q SEXP.RDS -t TXDB -o OUTPUT.RDS",
        description = "Select the most abundant TSS for each gene.

For each gene, transcripts are grouped by TSS, and their average abundances are added up. The TSS with the largest sum of average transcript abundances is selected as the representative TSS for that gene. These are all stored in a GRanges object in the output file. The resulting GRanges object will be annotated with a GeneID column. For transcripts with no associaated Gene ID, the GeneID column will be identical to the TxID column. Since a single TSS is being chosen for each gene, the GeneID column should not contain any duplicates.",
option_list = optlist,
add_help_option = TRUE,
prog = progname)

    cmdopts <- parse_args(parser, opts)
    ## Ensure that all required arguments were provided
    required.opts <- c("annotation-txdb", "output-file", "transcript-quant")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse_onestring(missing.opts)))
    }

    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

## Do this early to handle "--help" before wasting time loading
## pacakges & stuff
invisible(get_options(commandArgs(TRUE)))

library(assertthat)
library(dplyr)
library(magrittr)
library(stringr)
library(glue)
library(future)
library(SummarizedExperiment)

{
    cmdopts <- get_options(commandArgs(TRUE))
    cmdopts$help <- NULL

    ## ## For testing only
    ## cmdopts <- list(
    ##     transcript_quant = "saved_data/SummarizedExperiment_rnaseq_transcript_shoal_hg38.analysisSet_knownGene.RDS",
    ##     annotation_txdb = "TxDb.Hsapiens.UCSC.hg38.knownGene",
    ##     additional_gene_info = "/home/ryan/references/hg38/genemeta.org.Hs.eg.db.RDS",
    ##     output_file = "test.rds")

    tsmsg("Args:")
    print_var_vector(cmdopts)

    ## Delete the output file if it exists
    suppressWarnings(file.remove(cmdopts$output_file))
    assert_that(!file.exists(cmdopts$output_file))

    ## Only chr1-chr22,chrX,chrY
    std.chr <- extractSeqlevels("Homo sapiens", "UCSC") %>% setdiff("chrM")

    tsmsg("Reading quantification data")
    sexp <- readRDS(cmdopts$transcript_quant)
    sexp %<>% keepSeqlevels(std.chr, pruning.mode = "coarse")

    tsmsg("Reading annotation data")
    txdb <- get_txdb(cmdopts$annotation_txdb)

    tsmsg("Computing average transcript abundances")
    tx <- rowRanges(sexp)
    tx$abundance <- sexp %>% assay("abundance") %>% rowMeans
    tx$GeneID <- mapIds(txdb, names(tx),  keytype = "TXNAME", column = "GENEID", multiVals = "first")

    tsmsg("Grouping transcripts by TSS and gene ID")
    tss_table <- tx %>% promoters(upstream = 0, downstream = 1) %>% as("data.frame") %>%
        filter(!is.na(GeneID)) %>%
        group_by(GeneID, seqnames, start, end, strand) %>%
        summarize(transcript = str_c(transcript, collapse = ","),
                  abundance = sum(abundance))

    tsmsg("Selecting most abundant TSS for each gene")
    abundant_tss <- tss_table %>%
        arrange(desc(abundance)) %>% filter(!duplicated(GeneID)) %>%
        as("GRanges") %>% setNames(.$GeneID) %>%
        .[unique(tss_table$GeneID)]

    if ("additional_gene_info" %in% names(cmdopts)) {
        tsmsg("Reading additional gene annotation metadata")
        additional_gene_info <- read_additional_gene_info(cmdopts$additional_gene_info)
        ## Generate empty rows for genes that don't have additional
        ## info
        genes_without_info <- setdiff(abundant_tss$GeneID, rownames(additional_gene_info))
        if (length(genes_without_info) > 0) {
            empty_row <- list(character(0)) %>% rep(ncol(additional_gene_info)) %>% setNames(colnames(additional_gene_info))
            single.val.cols <- sapply(additional_gene_info, function(x) all(lengths(x) == 1))
            for (i in seq_along(empty_row)) {
                if (single.val.cols[i]) {
                    empty_row[[i]] <- NA
                } else {
                    empty_row[[i]] <- list(logical(0)) %>% as(class(additional_gene_info[[i]]))
                }
            }
            empty_row %<>% DataFrame
            empty_gene_table <- empty_row[rep(1, length(genes_without_info)),] %>%
                set_rownames(genes_without_info)
            additional_gene_info %<>% rbind(empty_gene_table)
        }
        assert_that(all(abundant_tss$GeneID %in% rownames(additional_gene_info)))
        mcols(abundant_tss)[colnames(additional_gene_info)] <- additional_gene_info[abundant_tss$GeneID,]
        metadata(abundant_tss) %<>% c(metadata(additional_gene_info))
    }

    tsmsg("Saving output file")
    save.RDS.or.RDA(abundant_tss, cmdopts$output_file)
    invisible(NULL)
}

R dplyr stringr optparse magrittr GLUE SummarizedExperiment assertthat future getopt rctutils rex From line 3 of scripts/select-abundant-tss.R

library(getopt)
library(glue)
library(optparse)
library(stringr)
library(assertthat)
library(SummarizedExperiment)
library(rctutils)

get_options <- function(opts) {
    optlist <- list(
        make_option(c("-i", "--input-file"), metavar = "FILENAME.RDS", type = "character",
                    help = "(REQUIRED) Input file name. This should be an RDS file containing a SummarizedExperiment object, whose containing the counts will be saved here using saveRDS, so it should end in '.RDS'."),
        make_option(c("-o", "--output-file-pattern"), metavar = "TEMPLATE.RDS", type = "character",
                    help = "(REQUIRED) Output file name pattern. This should contain one or more column names from the sample metadata enclosed in curly braces. For example: 'chipseq-counts-{chip_antibody}.RDS'. These will be filled in for each sample based on that sample's metadata, and the samples will be split into those files accordingly. Each SummarizedExperiment object will be saved using saveRDS, so it should end in '.RDS'."))
    progname <- na.omit(c(get_Rscript_filename(), "sexp-split.R"))[1]
    parser <- OptionParser(
        usage = "Usage: %prog -i INFILE.RDS -o OUTTEMPLATE.RDS",
        description = "Split SummarizedExperiment file by metadata",
        option_list = optlist,
        add_help_option = TRUE,
        prog = progname)

    cmdopts <- parse_args(parser, opts)
    cmdopts$help <- NULL
    ## Ensure that all required arguments were provided
    required.opts <- c("input-file", "output-file-pattern")
    missing.opts <- setdiff(required.opts, names(cmdopts))
    if (length(missing.opts) > 0) {
        stop(str_c("Missing required arguments: ", deparse(missing.opts)))
    }
    assert_that(str_detect(cmdopts[['output-file-pattern']], c("\\{.*\\}")))
    ## Replace dashes with underscores so that all options can easily
    ## be accessed by "$"
    cmdopts %>% setNames(chartr("-", "_", names(.)))
}

{
    cmdopts <- get_options(commandArgs(TRUE))
    ## TODO eliminate setwd
    tryCatch(setwd(file.path(dirname(na.omit(get_Rscript_filename())), "..")),
             error = function(...) tsmsg("WARNING: Could not determine script path. Ensure that you are already in the correct directory."))

    tsmsg("Args:")
    print_var_vector(cmdopts)

    tsmsg("Reading SummarizedExperiment file")
    sexp <- read_RDS_or_RDA(cmdopts$input_file, "SummarizedExperiment")

    output_filenames = glue_data(as.list(colData(sexp)), cmdopts$output_file_pattern)
    output_groups <- split(seq_len(ncol(sexp)), output_filenames)
    output_sexps <- lapply(output_groups, . %>% sexp[,.])
    for (fname in names(output_sexps)) {
        tsmsg("Writing ", fname)
        save_RDS_or_RDA(output_sexps[[fname]], fname)
    }
}

R stringr optparse GLUE SummarizedExperiment assertthat getopt rctutils From line 3 of scripts/split-sexp.R

shell: 'scripts/get-sra-run-files.R {wildcards.sra_run:q}'

SnakeMake From line 737 of master/Snakefile

shell:'''
echo "Dumping fastq for {wildcards.sra_run:q}..."
fastq-dump --stdout {input:q} | \
  scripts/fill-in-empty-fastq-qual.py \
  > {output.temp_unshuffled:q}
echo "Shuffling fastq for {wildcards.sra_run:q}..."
fastq-sort --random --seed=1986 {output.temp_unshuffled:q} > {output.temp_shuffled:q}
echo "Compressing fastq for {wildcards.sra_run:q}..."
{params.compress_cmd} < {output.temp_shuffled:q} > {output:q}
rm -f {output.temp_unshuffled:q} {output.temp_shuffled:q}
'''

SnakeMake From line 763 of master/Snakefile

shell: '''
STAR \
  --runThreadN {threads:q} \
  --runMode alignReads \
  --genomeDir {params.index_genomedir:q} \
  --sjdbGTFfile {input.transcriptome_gff:q} \
  --sjdbGTFfeatureExon CDS \
  --sjdbGTFtagExonParentTranscript Parent \
  --sjdbGTFtagExonParentGene gene_id \
  --sjdbOverhang 100 \
  --readFilesIn {input.fastq:q} \
  --readFilesCommand {params.read_cmd:q} \
  --outSAMattributes Standard \
  --outSAMunmapped Within \
  --outFileNamePrefix {params.outdir:q} \
  --outSAMtype SAM
'''

SnakeMake STAR From line 798 of master/Snakefile

shell: '''
picard-tools SortSam \
  I={input.sam:q} O={output.bam:q} \
  SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT
'''

SnakeMake From line 821 of master/Snakefile

run:
    index_basename = re.sub('\\.1\\.ht2', '', input.index_f1)
    outdir = os.path.dirname(output.bam)
    cmds = [
        [
            'hisat2',
            '--threads', str(threads),
            '-q', '--phred33',
            '--very-sensitive',
            '--dta-cufflinks',
            '-x', index_basename,
            '-U', input.fastq,
            '-k', '20',
            '--time',
        ],
        [
            # Convert to UCSC chromosome names
            'scripts/bam-rename-chroms.py', input.chrom_mapping,
        ],
        [
            'picard-tools', 'SortSam', 'I=/dev/stdin', 'O=/dev/stdout',
            'SORT_ORDER=coordinate', 'VALIDATION_STRINGENCY=LENIENT',
        ]
    ]
    with atomic_write(output.bam, mode='wb', overwrite=True) as outfile, \
         open(log[0], mode='wb') as logfile:
        pipeline = Popen_pipeline(cmds, stdout=outfile, stderr=logfile)
        wait_for_subprocs(pipeline)

SnakeMake HISAT2 From line 842 of master/Snakefile

shell: '''
picard-tools BuildBamIndex I={input:q} O={output:q} \
    VALIDATION_STRINGENCY=LENIENT
'''

SnakeMake From line 885 of master/Snakefile

shell: '''
picard-tools BuildBamIndex I={input:q} O={output:q} \
    VALIDATION_STRINGENCY=LENIENT
'''

SnakeMake From line 902 of master/Snakefile

shell: '''
bedtools bamtobed -i {input:q} > {output:q}
'''

SnakeMake BEDTools From line 916 of master/Snakefile

shell: '''
macs2 filterdup --ifile {input:q} --format BAM \
  --gsize hs --keep-dup auto \
  --ofile {output.bed:q} \
  2>&1 | tee {log:q} 1>&2
'''

SnakeMake macs2 From line 937 of master/Snakefile

shell: '''
scripts/rnaseq-count.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern {params.bam_file_pattern:q} \
  --output-file {output.sexp:q} \
  --expected-bam-files {params.expected_bam_files:q} \
  --threads {threads:q} \
  --annotation-txdb {input.txdb:q} \
  --additional-gene-info {input.genemeta:q}
'''

SnakeMake From line 972 of master/Snakefile

shell: '''
scripts/rnaseq-count.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern {params.bam_file_pattern:q} \
  --output-file {output.sexp:q} \
  --expected-bam-files {params.expected_bam_files:q} \
  --threads {threads:q} \
  --annotation-txdb 'TxDb.Hsapiens.UCSC.hg38.knownGene' \
  --additional-gene-info {input.genemeta:q}
'''

SnakeMake From line 1008 of master/Snakefile

shell: '''
scripts/rnaseq-count.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern {params.bam_file_pattern:q} \
  --output-file {output.sexp:q} \
  --expected-bam-files {params.expected_bam_files:q} \
  --threads {threads:q} \
  --annotation-txdb {input.txdb:q} \
  --additional-gene-info {input.genemeta:q}
'''

SnakeMake From line 1045 of master/Snakefile

shell: '''
scripts/rnaseq-count.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern aligned/rnaseq_star_hg38.analysisSet_knownGene/%s/Aligned.sortedByCoord.out.bam \
  --output-file {output.sexp:q} \
  --expected-bam-files {params.expected_bam_files:q} \
  --threads {threads:q} \
  --annotation-txdb TxDb.Hsapiens.UCSC.hg38.knownGene \
  --additional-gene-info {input.genemeta:q}
'''

SnakeMake From line 1079 of master/Snakefile

shell: '''
timeout 1h \
  salmon quant \
  --index {params.index_dir:q} \
  --libType {params.libtype:q} \
  --unmatedReads {input.fastq:q} \
  --threads {threads:q} \
  --seqBias --gcBias --useVBOpt \
  --dumpEq --dumpEqWeights \
  --geneMap {input.genemap_file:q} \
  --output {params.outdir:q} \
  --auxDir aux_info \
  --numGibbsSamples 100
'''

SnakeMake Quant Salmon From line 1110 of master/Snakefile

shell: ''' scripts/convert-salmon-to-hdf5.R {wildcards.salmon_quant_dir:q} '''

SnakeMake From line 1133 of master/Snakefile

shell: '''
run_shoal.sh -j {threads:q} -q {params.quantdir:q} -o {params.outdir:q}
'''

SnakeMake From line 1149 of master/Snakefile

run:
    libType = list(rnaseq_samplemeta['libType'][rnaseq_samplemeta['SRA_run'] == wildcards.SRA_run])[0]
    if libType == 'SF':
        lib_opt = '--fr-stranded'
    elif libType == 'SR':
        lib_opt = '--rf-stranded'
    else:
        raise ValueError('Unknown kallisto libtype: {}'.format(libType))
    shell('''
    mkdir -p {params.outdir:q}
    kallisto quant \
      --index {input.kallisto_index:q} --output-dir {params.outdir:q} \
      {lib_opt:q} --single --threads {threads:q} --bootstrap-samples 100 \
      --bias --fragment-length 200 --sd 80 {input.fastq:q}
    ''')

SnakeMake Quant kallisto From line 1170 of master/Snakefile

shell: '''
bowtie2 --threads {threads:q} --mm \
  -U {input.fastq:q} -x {params.index_basename:q} -q \
  --end-to-end --sensitive | \
picard-tools SortSam I=/dev/stdin O={output.bam:q} \
  SORT_ORDER=coordinate VALIDATION_STRINGENCY=LENIENT
'''

SnakeMake Bowtie 2 From line 1205 of master/Snakefile

shell: 'zcat {input:q} > {output:q}'

SnakeMake From line 1221 of master/Snakefile

shell: '''tar -O -xjf {input.tar:q} {params.file_inside_tar:q} > {output.bed:q}'''

SnakeMake From line 1232 of master/Snakefile

script: 'scripts/liftOver-MotifMap.R'

SnakeMake From line 1240 of master/Snakefile

script: 'scripts/get-CpG.R'

SnakeMake From line 1244 of master/Snakefile

shell: '''zcat {input:q} > {output:q}'''

SnakeMake From line 1255 of master/Snakefile

shell: '''liftOver {input.bed:q} {input.chain:q} {output.bed:q} /dev/null'''

SnakeMake ucsc-liftover From line 1266 of master/Snakefile

shell: '''MC_CORES={threads:q} scripts/generate-greylists.R'''

SnakeMake From line 1291 of master/Snakefile

shell: '''cat {input:q} > {output:q}'''

SnakeMake From line 1301 of master/Snakefile

shell: '''
scripts/generate-promoters.R \
  --txdb {input.txdb:q} \
  --promoter-radius {wildcards.radius:q} \
  --additional-gene-info {input.genemeta:q} \
  --output-file {output.rds:q}
'''

SnakeMake From line 1310 of master/Snakefile

shell: '''
scripts/generate-promoters.R \
  --txdb {params.txdb:q} \
  --promoter-radius {wildcards.radius:q} \
  --additional-gene-info {input.genemeta:q} \
  --output-file {output.rds:q}
'''

SnakeMake From line 1326 of master/Snakefile

run:
    rfile_basename = os.path.basename(output.rfile)
    rfile_dirname = os.path.dirname(output.rfile)
    shell('''
    macs2 predictd -i {input.bam_files:q} -f BAM -g hs \
      --outdir {rfile_dirname:q} --rfile {rfile_basename:q} \
      &>{output.logfile:q}
    cd plots
    Rscript ../{rfile_dirname:q}/{rfile_basename:q}
    ''')

SnakeMake macs2 From line 1348 of master/Snakefile

shell: '''
macs2 callpeak \
  --treatment {input.chip_pulldown:q} \
  --control {input.chip_input:q} \
  --format BAM \
  --gsize hs \
  --keep-dup auto \
  --outdir {params.outdir:q} \
  --name peakcall \
  --nomodel --extsize 147 \
  --pvalue=0.5 \
  2>&1 | tee {log:q};
prename 's/peakcall_//' {params.outdir:q}/*
'''

SnakeMake macs2 From line 1383 of master/Snakefile

shell: '''
macs2 callpeak \
  --treatment {input.chip_pulldown:q} \
  --control {input.chip_input:q} \
  --format BAM \
  --gsize hs \
  --keep-dup auto \
  --outdir {params.outdir:q} \
  --name peakcall \
  --bdg \
  --nomodel --extsize 147 \
  --pvalue=0.5 \
  2>&1 | tee {log:q}
prename 's/peakcall_//' {params.outdir:q}/*
'''

SnakeMake macs2 From line 1423 of master/Snakefile

shell: '''
macs2 callpeak \
  --treatment {input.chip_pulldown:q} \
  --control {input.chip_input:q} \
  --format BAM \
  --gsize hs \
  --keep-dup auto \
  --outdir {params.outdir:q} \
  --name peakcall \
  --bdg \
  --nomodel --extsize 147 \
  --pvalue=0.5 \
  2>&1 | tee {log:q}
prename 's/peakcall_//' {params.outdir:q}/*
'''

SnakeMake macs2 From line 1465 of master/Snakefile

shell: '''
macs2 callpeak \
  --treatment {input.chip_pulldown:q} \
  --control {input.chip_input:q} \
  --format BAM \
  --gsize hs \
  --keep-dup auto \
  --outdir {params.outdir:q} \
  --name peakcall \
  --nomodel --extsize 147 \
  --pvalue=0.5 \
  2>&1 | tee {log:q}
prename 's/peakcall_//' {params.outdir:q}/*
'''

SnakeMake macs2 From line 1508 of master/Snakefile

run:
    with open(output.peaks, 'wb') as outfile, open(log[0], 'wb') as logfile:
        cmd = ['epic'] + \
              ['--treatment'] + input.chip_pulldown + \
              ['--control'] + input.chip_input + \
              [
                  '--number-cores', threads,
                  '--genome', 'hg38',
                  '--fragment-size', 147,
                  '--keep-duplicates', 'True',
                  '--bigwig', os.path.join(params.outdir, 'bigwig'),
              ]
        cmd = [str(x) for x in cmd]
        p = Popen(cmd, stdout=outfile, stderr=PIPE)
        for logline in p.stderr:
            logfile.write(logline)
            sys.stderr.write(logline.decode(sys.getdefaultencoding()))

SnakeMake From line 1550 of master/Snakefile

run:
    with open(output.peaks, 'wb') as outfile, open(log[0], 'wb') as logfile:
        cmd = ['epic'] + \
              ['--treatment'] + input.chip_pulldown + \
              ['--control'] + input.chip_input + \
              [
                  '--number-cores', threads,
                  '--genome', 'hg38',
                  '--fragment-size', 147,
                  '--keep-duplicates', 'True',
                  '--bigwig', os.path.join(params.outdir, 'bigwig'),
              ]
        cmd = [str(x) for x in cmd]
        p = Popen(cmd, stdout=outfile, stderr=PIPE)
        for logline in p.stderr:
            logfile.write(logline)
            sys.stderr.write(logline.decode(sys.getdefaultencoding()))

SnakeMake From line 1596 of master/Snakefile

run:
    with open(output.peaks, 'wb') as outfile, open(log[0], 'wb') as logfile:
        cmd = ['epic'] + \
              ['--treatment'] + input.chip_pulldown + \
              ['--control'] + input.chip_input + \
              [
                  '--number-cores', threads,
                  '--genome', 'hg38',
                  '--fragment-size', 147,
                  '--keep-duplicates', 'True',
                  '--bigwig', os.path.join(params.outdir, 'bigwig'),
              ]
        cmd = [str(x) for x in cmd]
        p = Popen(cmd, stdout=outfile, stderr=PIPE)
        for logline in p.stderr:
            logfile.write(logline)
            sys.stderr.write(logline.decode(sys.getdefaultencoding()))

SnakeMake From line 1643 of master/Snakefile

run:
    with open(output.peaks, 'wb') as outfile, open(log[0], 'wb') as logfile:
        cmd = ['epic'] + \
              ['--treatment'] + input.chip_pulldown + \
              ['--control'] + input.chip_input + \
              [
                  '--number-cores', threads,
                  '--genome', 'hg38',
                  '--fragment-size', 147,
                  '--keep-duplicates', 'True',
                  '--bigwig', os.path.join(params.outdir, 'bigwig'),
              ]
        cmd = [str(x) for x in cmd]
        p = Popen(cmd, stdout=outfile, stderr=PIPE)
        for logline in p.stderr:
            logfile.write(logline)
            sys.stderr.write(logline.decode(sys.getdefaultencoding()))

SnakeMake From line 1691 of master/Snakefile

run:
    peaks = pd.DataFrame.from_csv(input[0], header=1, sep=' ', index_col=None)
    ndigits = ceil(log10(peaks.shape[0]+1))
    name_format = 'epic_peak_{{:0{}d}}'.format(ndigits)
    tiny_float = np.finfo(float).tiny
    narrowpeak = pd.DataFrame.from_items((
        ('chrom', peaks['Chromosome']),
        ('chromStart', peaks['Start']),
        ('chromEnd', peaks['End']),
        ('name', pd.Series(name_format.format(x) for x in range(peaks.shape[0]))),
        ('score', peaks['Score']),
        ('strand', '.'),
        ('signalValue', peaks['Fold_change']),
        ('pValue', -np.log10(peaks['P'].where(peaks['P'] > tiny_float, other=tiny_float))),
        ('qValue', -np.log10(peaks['FDR'].where(peaks['FDR'] > tiny_float, other=tiny_float))),
        # Epic doesn't call a peak, so just use the midpoint
        ('peak', np.array(np.around((peaks['End'] - peaks['Start']) / 2), dtype=np.int64)),
    ))
    narrowpeak.to_csv(output[0], sep='\t',
                      header=False, index=False,
                      quoting=csv.QUOTE_NONE,)

SnakeMake From line 1720 of master/Snakefile

shell: '''
bedtools subtract -A -a {input.peaks:q} -b {input.blacklist:q} > {output.peaks:q}
if [ ! -s {output.peaks:q} ]; then
  rm -f {output.peaks:q}
fi
'''

SnakeMake BEDTools From line 1750 of master/Snakefile

run:
    pick_top_peaks(input.donorA_peaks, output.temp_donorA_peaks, by='pValue', number=150000)
    pick_top_peaks(input.donorB_peaks, output.temp_donorB_peaks, by='pValue', number=150000)
    shell('''
    idr --samples {output.temp_donorA_peaks:q} {output.temp_donorB_peaks:q} \
      --input-file-type narrowPeak \
      --rank p.value \
      --output-file {output.outfile:q} \
      --output-file-type bed \
      --log-output-file {log:q} \
      --plot \
      --random-seed 1986
    mv {output.outfile:q}.png {output.plotfile:q}
    ''')

SnakeMake From line 1774 of master/Snakefile

run:
    pick_top_peaks(input.donorA_peaks, output.temp_donorA_peaks, by='pValue', number=150000)
    pick_top_peaks(input.donorB_peaks, output.temp_donorB_peaks, by='pValue', number=150000)
    shell('''
    idr --samples {output.temp_donorA_peaks:q} {output.temp_donorB_peaks:q} \
      --input-file-type narrowPeak \
      --rank p.value \
      --output-file {output.outfile:q} \
      --output-file-type bed \
      --log-output-file {log:q} \
      --plot \
      --random-seed 1986
    mv {output.outfile:q}.png {output.plotfile:q}
    ''')

SnakeMake From line 1805 of master/Snakefile

run:
    pick_top_peaks(input.donorA_peaks, output.temp_donorA_peaks, by='score', number=150000)
    pick_top_peaks(input.donorB_peaks, output.temp_donorB_peaks, by='score', number=150000)
    shell('''
    idr --samples {output.temp_donorA_peaks:q} {output.temp_donorB_peaks:q} \
      --input-file-type bed \
      --rank score \
      --output-file {output.outfile:q} \
      --output-file-type bed \
      --log-output-file {log:q} \
      --plot \
      --random-seed 1986
    mv {output.outfile:q}.png {output.plotfile:q}
    ''')

SnakeMake From line 1836 of master/Snakefile

run:
    pick_top_peaks(input.donorA_peaks, output.temp_donorA_peaks, by='score', number=150000)
    pick_top_peaks(input.donorB_peaks, output.temp_donorB_peaks, by='score', number=150000)
    shell('''
    idr --samples {output.temp_donorA_peaks:q} {output.temp_donorB_peaks:q} \
      --input-file-type bed \
      --rank score \
      --output-file {output.outfile:q} \
      --output-file-type bed \
      --log-output-file {log:q} \
      --plot \
      --random-seed 1986
    mv {output.outfile:q}.png {output.plotfile:q}
    ''')

SnakeMake From line 1867 of master/Snakefile

shell: '''
scripts/plot-idr.R -i {input:q} -o {output:q} \
  -A {params.sampleA:q} -B {params.sampleB:q} \
  -P {params.common_prefix:q}
'''

SnakeMake From line 1893 of master/Snakefile

run:
    idr_results = ','.join(input.idr_results_files)
    shell('''scripts/filter-by-idr.R -p {input.combined_peaks:q} -o {output.filtered_peaks:q} -i {idr_results:q} -r''')

SnakeMake From line 1911 of master/Snakefile

run:
    idr_results = ','.join(input.idr_results_files)
    shell('''scripts/filter-by-idr.R -p {input.combined_peaks:q} -o {output.filtered_peaks:q} -i {idr_results:q} -r''')

SnakeMake From line 1926 of master/Snakefile

shell: 'MC_CORES={threads:q} scripts/chipseq-compute-ccf.R'

SnakeMake From line 1946 of master/Snakefile

shell: 'scripts/chipseq-plot-ccf.R'

SnakeMake From line 1963 of master/Snakefile

shell: 'MC_CORES={threads:q} scripts/chipseq-profile-sites.R'

SnakeMake From line 1983 of master/Snakefile

shell: '''
scripts/chipseq-count-windows.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern 'aligned/chipseq_bowtie2_hg38.analysisSet/%s/Aligned.bam' \
  --window-width {wildcards.window_size:q} \
  --read-extension {wildcards.read_ext:q} \
  --blacklist {input.blacklist:q} \
  --threads {threads:q} \
  --output-file {output:q}
'''

SnakeMake From line 2002 of master/Snakefile

shell: '''
scripts/chipseq-count-windows.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern 'aligned/chipseq_bowtie2_hg38.analysisSet/%s/Aligned.bam' \
  --window-width {wildcards.window_size:q} \
  --blacklist {input.blacklist:q} \
  --bin \
  --threads {threads:q} \
  --output-file {output:q}
'''

SnakeMake From line 2031 of master/Snakefile

shell: '''
scripts/chipseq-count-regions.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern 'aligned/chipseq_bowtie2_hg38.analysisSet/{{SAMPLE}}/Aligned.bam' \
  --regions {input.promoters:q} \
  --read-extension {wildcards.read_ext:q} \
  --blacklist {input.blacklist:q} \
  --threads {threads:q} \
  --output-file {output:q}
'''

SnakeMake From line 2061 of master/Snakefile

shell: '''
scripts/chipseq-count-regions.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --filter-sample-ids={params.sample_id_list:q} \
  --bam-file-pattern 'aligned/chipseq_bowtie2_hg38.analysisSet/{{SAMPLE}}/Aligned.bam' \
  --regions {input.peaks:q} \
  --read-extension {wildcards.read_ext:q} \
  --blacklist {input.blacklist:q} \
  --threads {threads:q} \
  --output-file {output:q}
'''

SnakeMake From line 2095 of master/Snakefile

shell: '''
scripts/chipseq-count-neighborhoods.R \
  --samplemeta-file {input.samplemeta:q} \
  --sample-id-column SRA_run \
  --bam-file-pattern 'aligned/chipseq_bowtie2_hg38.analysisSet/{{SAMPLE}}/Aligned.bam' \
  --targets {input.tss:q} \
  --read-extension {wildcards.read_ext:q} \
  --blacklist {input.blacklist:q} \
  --threads {threads:q} \
  --upstream-neighborhood {wildcards.radius:q} \
  --downstream-neighborhood {wildcards.radius:q} \
  --window-width {wildcards.wsize:q} \
  --initial-window-offset 0 \
  --blacklist-action mark \
  --output-file {output:q}
'''

SnakeMake From line 2126 of master/Snakefile

shell: '''
scripts/split-sexp.R \
  -i {input:q} \
  -o 'saved_data/chipseq-counts_{wildcards.window_size:q}-windows_{wildcards.read_ext:q}-reads_{{chip_antibody}}.RDS'
'''

SnakeMake From line 2155 of master/Snakefile

shell: '''
scripts/split-sexp.R \
  -i {input:q} \
  -o 'saved_data/chipseq-counts_{wildcards.window_size:q}-bigbins_{{chip_antibody}}.RDS'
'''

SnakeMake From line 2173 of master/Snakefile

shell: '''
scripts/split-sexp.R \
  -i {input:q} \
  -o 'saved_data/promoter-counts_{wildcards.base:q}_{wildcards.read_ext:q}-reads_{{chip_antibody}}.RDS'
'''

SnakeMake From line 2191 of master/Snakefile

shell: '''
scripts/split-sexp.R \
  -i {input:q} \
  -o 'saved_data/tss-neighborhood-counts_{wildcards.base:q}_{wildcards.read_ext:q}-reads_{{chip_antibody}}.RDS'
'''

SnakeMake From line 2209 of master/Snakefile

run:
    cmd = [
        'scripts/convert-quant-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--abundance-file-pattern', *expand('{quantifier}_quant/hg38.analysisSet_ensembl.{release}/{{SAMPLE}}/abundance.h5', **wildcards),
        '--output-file', output.sexp,
        '--expected-abundance-files', ','.join(input.samples),
        '--aggregate-level', 'gene',
        '--annotation-txdb', input.txdb,
        '--gene-info', input.genemeta,
    ]
    check_call(cmd)

SnakeMake From line 2229 of master/Snakefile

run:
    cmd = [
        'scripts/convert-quant-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--abundance-file-pattern', *expand('{quantifier}_quant/hg38.analysisSet_knownGene/{{SAMPLE}}/abundance.h5', **wildcards),
        '--output-file', output.sexp,
        '--expected-abundance-files', ','.join(input.samples),
        '--aggregate-level', 'gene',
        '--annotation-txdb', 'TxDb.Hsapiens.UCSC.hg38.knownGene',
        '--gene-info', input.genemeta,
    ]
    check_call(cmd)

SnakeMake From line 2256 of master/Snakefile

run:
    cmd = [
        'scripts/convert-shoal-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--shoal-dir', *expand('shoal_quant/hg38.analysisSet_ensembl.{release}', **wildcards),
        '--output-file', output.sexp,
        '--aggregate-level', 'gene',
        '--annotation-txdb', input.txdb,
        '--gene-info', input.genemeta,
    ]
    check_call(cmd)

SnakeMake From line 2284 of master/Snakefile

run:
    cmd = [
        'scripts/convert-shoal-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--shoal-dir', *expand('shoal_quant/hg38.analysisSet_knownGene', **wildcards),
        '--output-file', output.sexp,
        '--aggregate-level', 'gene',
        '--annotation-txdb', 'TxDb.Hsapiens.UCSC.hg38.knownGene',
        '--gene-info', input.genemeta,
    ]
    check_call(cmd)

SnakeMake From line 2310 of master/Snakefile

run:
    cmd = [
        'scripts/convert-quant-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--abundance-file-pattern', *expand('{quantifier}_quant/hg38.analysisSet_ensembl.{release}/{{SAMPLE}}/abundance.h5', **wildcards),
        '--output-file', output.sexp,
        '--expected-abundance-files', ','.join(input.samples),
        '--aggregate-level', 'transcript',
        '--annotation-txdb', input.txdb,
    ]
    check_call(cmd)

SnakeMake From line 2336 of master/Snakefile

run:
    cmd = [
        'scripts/convert-quant-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--abundance-file-pattern', *expand('{quantifier}_quant/hg38.analysisSet_knownGene/{{SAMPLE}}/abundance.h5', **wildcards),
        '--output-file', output.sexp,
        '--expected-abundance-files', ','.join(input.samples),
        '--aggregate-level', 'transcript',
        '--annotation-txdb', 'TxDb.Hsapiens.UCSC.hg38.knownGene',
    ]
    check_call(cmd)

SnakeMake From line 2361 of master/Snakefile

run:
    cmd = [
        'scripts/convert-shoal-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--shoal-dir', *expand('shoal_quant/hg38.analysisSet_ensembl.{release}', **wildcards),
        '--output-file', output.sexp,
        '--aggregate-level', 'transcript',
        '--annotation-txdb', input.txdb,
    ]
    check_call(cmd)

SnakeMake From line 2387 of master/Snakefile

run:
    cmd = [
        'scripts/convert-shoal-to-sexp.R',
        '--samplemeta-file', input.samplemeta,
        '--sample-id-column', 'SRA_run',
        '--shoal-dir', *expand('shoal_quant/hg38.analysisSet_knownGene', **wildcards),
        '--output-file', output.sexp,
        '--aggregate-level', 'transcript',
        '--annotation-txdb', 'TxDb.Hsapiens.UCSC.hg38.knownGene',
    ]
    check_call(cmd)

SnakeMake From line 2411 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input = input.rmd, output_file = os.path.join(os.getcwd(), output.html),
               output_format = 'html_notebook',
               params = {
                   'quant_method': wildcards.quant_method,
                   'genome': wildcards.genome,
                   'transcriptome': wildcards.transcriptome,
               })

SnakeMake From line 2439 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2462 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'quant_method': wildcards.quant_method,
                   'genome': wildcards.genome,
                   'transcriptome': wildcards.transcriptome,
               })

SnakeMake From line 2480 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2499 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'histone_mark': wildcards.chip_antibody,
                   'window_size': '500bp',
                   'fragment_length': '147bp',
                   'bigbin_size': '10kbp',
               })

SnakeMake From line 2516 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'histone_mark': wildcards.chip_antibody,
                   'window_size': '500bp',
                   'fragment_length': '147bp',
               })

SnakeMake From line 2542 of master/Snakefile

run:
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html))

SnakeMake From line 2563 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'genome': wildcards.genome,
                   'transcriptome': wildcards.transcriptome,
                   'histone_mark': wildcards.chip_antibody,
                   'promoter_radius': wildcards.promoter_radius,
                   'fragment_length': '147bp',
                   'bigbin_size': '10kbp',
               })

SnakeMake From line 2576 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'genome': wildcards.genome,
                   'transcriptome': wildcards.transcriptome,
                   'histone_mark': wildcards.chip_antibody,
                   'promoter_radius': wildcards.promoter_radius,
                   'fragment_length': '147bp',
               })

SnakeMake From line 2602 of master/Snakefile

run:
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2626 of master/Snakefile

shell: '''cp {input:q} {output:q}'''

SnakeMake From line 2649 of master/Snakefile

shell: '''zcat {input:q} | perl -lane 'print if $A[4] >= {wildcards.score_threshold};' > {output:q}'''

SnakeMake From line 2654 of master/Snakefile

script: 'scripts/get-tfbs-conserved.R'

SnakeMake From line 2665 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2685 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2707 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2729 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2744 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd,
               output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook')

SnakeMake From line 2759 of master/Snakefile

shell: '''Rscript scripts/prepare-msigdb.R'''

SnakeMake From line 2777 of master/Snakefile

shell: '''Rscript scripts/prepare-graphite.R'''

SnakeMake From line 2787 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_run_without_rendering(input.rmd)

SnakeMake From line 2805 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_run_without_rendering(
        input.rmd,
        params={
            'genome': 'hg38.analysisSet',
            'transcriptome': 'ensembl.85',
            'histone_mark': wildcards.histone_mark,
            'promoter_radius': wildcards.promoter_radius,
            'fragment_length': '147bp',
        })

SnakeMake From line 2826 of master/Snakefile

shell: '''
Rscript scripts/select-abundant-tss.R \
  --transcript-quant {input.sexp:q} \
  --annotation-txdb {input.txdb:q} \
  --additional-gene-info {input.genemeta:q} \
  --output-file {output.tss:q}
'''

SnakeMake From line 2846 of master/Snakefile

shell: '''
Rscript scripts/select-abundant-tss.R \
  --transcript-quant {input.sexp:q} \
  --annotation-txdb 'TxDb.Hsapiens.UCSC.hg38.knownGene' \
  --additional-gene-info {input.genemeta:q} \
  --output-file {output.tss:q}
'''

SnakeMake From line 2861 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'genome': wildcards.genome,
                   'transcriptome': wildcards.transcriptome,
                   'histone_mark': wildcards.chip_antibody,
                   'neighborhood_radius': wildcards.neighborhood_radius,
                   'window_size': wildcards.window_size,
                   'fragment_length': '147bp',
                   'bigbin_size': '10kbp',
               })

SnakeMake From line 2881 of master/Snakefile

run:
    os.environ['MC_CORES'] = str(threads)
    rmd_render(input=input.rmd, output_file=os.path.join(os.getcwd(), output.html),
               output_format='html_notebook',
               params={
                   'genome': wildcards.genome,
                   'transcriptome': wildcards.transcriptome,
                   'quant_method': wildcards.quant_method,
               })

SnakeMake From line 2908 of master/Snakefile

import os.path
import re
import shutil
import subprocess
import sys

def get_command_version_string(cmd, regexp, *, prefix="", suffix="", use_stderr=None,
                               encoding=sys.getdefaultencoding(), raise_on_error=False):
    '''Get the version string from a command.

    Arguments:

    cmd: a string or list of strings to run a command that will
    print a version number, typically something like 'mycommand
    --version'.

    regexp: A regular expression including a named capture group with
    a name of 'version' that captures just the version string. For
    example, if the command returns 'mycommand v1.2.3', the regexp
    might be 'v(?P<version>(\\d+\\.)*\\d+)', which would match the
    string '1.2.3' in the 'version' named capture group. If the regexp
    doesn't match the output of the command, or is missing the named
    capture group, an exception is raised.

    Keyword-only arguments:

    prefix, suffix: Prepended/appended to the version string before
    returning.

    use_stderr: If the command is known to print its version to
    standard error instead of standard output, set this to True. If it
    is known to print its version to standard output, set this to
    False. If it is unknown, leave it as None, and the concatenation
    of both (standard output first) will be searched for the version
    string.

    raise_on_error: If False (the default), will return None if an
    error is encountered, including failing to find the command or
    failing to match the regular expression. If True, the exception
    will be raised as normal.

    encoding: Which text encoding to use to read the output of the
    command. Use the system default if not specified.

    '''
    try:
        use_shell = isinstance(cmd, str)
        p = subprocess.Popen(cmd, shell=use_shell, stdin=None, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (stdout, stderr) = p.communicate()
        if use_stderr is None:
            output = stdout + stderr
        elif use_stderr:
            output = stderr
        else:
            output = stdout
        output = output.decode(encoding)
        m = re.search(regexp, output)
        if m is None:
            raise ValueError("Regular expression did not match command output")
        return prefix + m.group("version") + suffix
    except Exception as ex:
        if raise_on_error:
            raise ex
        else:
            return None

ascp_path = shutil.which("ascp") or os.path.expanduser("~/.aspera/connect/bin/ascp")

SOFTWARE_VERSIONS = dict()

# Determine the versions of various programs used
SOFTWARE_VERSIONS['ASCP'] = get_command_version_string([ascp_path, '--version'], 'ascp version\\s+(?P<version>\\S+)', prefix='ascp ')
SOFTWARE_VERSIONS['BEDTOOLS'] = get_command_version_string('bedtools --version', 'bedtools\\s+(?P<version>\\S+)', prefix='bedtools ')
SOFTWARE_VERSIONS['BOWTIE2'] = get_command_version_string('bowtie2 --version', 'version\\s+(?P<version>\\S+)', prefix='bowtie2 ')
SOFTWARE_VERSIONS['CUFFLINKS'] = get_command_version_string('cufflinks --help', 'cufflinks v(?P<version>\S+)', prefix='cufflinks ')
SOFTWARE_VERSIONS['EPIC'] = get_command_version_string('epic --version', 'epic\\s+(?P<version>\\S+)', prefix='epic ')
SOFTWARE_VERSIONS['FASTQ_TOOLS'] = get_command_version_string('fastq-sort --version', '(?P<version>\\d+(\\.\\d+)*)', prefix='fastq-tools ')
SOFTWARE_VERSIONS['HISAT2'] = get_command_version_string('hisat2 --version', 'version\\s+(?P<version>\\S+)', prefix='hisat2 ')
SOFTWARE_VERSIONS['IDR'] = get_command_version_string('idr --version', '(?P<version>\\d+(\\.\\d+)*)', prefix='IDR ')
SOFTWARE_VERSIONS['KALLISTO'] = get_command_version_string('kallisto', '^kallisto\\s+(?P<version>\\S+)', prefix='kallisto ')
SOFTWARE_VERSIONS['MACS'] = get_command_version_string('macs2 --version', 'macs2\\s+(?P<version>\\S+)', prefix='macs2 ')
SOFTWARE_VERSIONS['SALMON'] = get_command_version_string('salmon --version', 'version\\s+:\\s+(?P<version>\\S+)', prefix='salmon ')
SOFTWARE_VERSIONS['SAMTOOLS'] = get_command_version_string('samtools', 'Version:\\s+(?P<version>\\S+)', prefix='samtools ')
SOFTWARE_VERSIONS['SRATOOLKIT'] = get_command_version_string('fastq-dump --version', ':\\s+(?P<version>\\S+)', prefix='sratoolkit ')
SOFTWARE_VERSIONS['STAR'] = get_command_version_string('STAR --version', 'STAR_(?P<version>\\S+)', prefix='STAR ')

# R, BioC, & packages
try:
    from rpy2.robjects import r
    from rpy2.rinterface import RRuntimeError
    SOFTWARE_VERSIONS['R'] = ''.join(r('R.version$version.string'))
except RRuntimeError:
    SOFTWARE_VERSIONS['R'] = None

try:
    from rpy2.robjects import r
    from rpy2.rinterface import RRuntimeError
    SOFTWARE_VERSIONS['BIOC'] = 'Bioconductor ' + ''.join(r('tryCatch(as.character(BiocManager::version()), error = function(...) BiocInstaller::biocVersion())'))
except RRuntimeError:
    SOFTWARE_VERSIONS['BIOC'] = None

def R_package_version(pkgname):
    try:
        from rpy2.robjects import r
        from rpy2.rinterface import RRuntimeError
        pkgversion = r('installed.packages()[,"Version"]').rx(pkgname)[0]
        if r['is.na'](pkgversion)[0]:
            raise ValueError("Could not determine package version for {!r}. Maybe the package is not installed?".format(pkgname))
        return ' '.join([pkgname, pkgversion])
    except Exception:
        return None