A Snakemake workflow for differential expression analysis of RNA-seq data with Kallisto and Sleuth.

Version: v2.5.1


Usage

The usage of this workflow is described in the Snakemake Workflow Catalog; a minimal module-style sketch also follows below.

If you use this workflow in a paper, please give credit to the authors by citing the URL of this (original) repository and its DOI (see above).
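As a sketch of that usage, the workflow could be consumed as a Snakemake module. The repository location and the tag are assumptions inferred from the version shown above; consult the catalog entry for the authoritative instructions.

module rna_seq_kallisto_sleuth:
    snakefile:
        # repository and tag are assumptions, not taken from this page
        github(
            "snakemake-workflows/rna-seq-kallisto-sleuth",
            path="workflow/Snakefile",
            tag="v2.5.1",
        )
    config:
        config

use rule * from rna_seq_kallisto_sleuth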

Code Snippets

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess
import sys
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
fmt = snakemake.params.fmt
build = snakemake.params.build
flavor = snakemake.params.get("flavor", "")

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

if flavor:
    flavor += "."

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffix = ""
if fmt == "gtf":
    suffix = "gtf.gz"
elif fmt == "gff3":
    suffix = "gff3.gz"

url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    fmt=fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)

try:
    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
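For orientation, here is a minimal sketch of a rule that could drive this download script through its wrapper. The output path and the species/release/build values are placeholders; the params names are exactly those the script reads, and the wrapper version is the one referenced further down this page.

rule get_annotation:
    output:
        "resources/genome.gtf",
    params:
        species="homo_sapiens",  # placeholder values, adapt to your config
        release="100",
        build="GRCh38",
        fmt="gtf",
    log:
        "logs/get_annotation.log",
    wrapper:
        "0.80.1/bio/reference/ensembl-annotation"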
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 1, "Input must contain 1 (single-end) element."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq}"
    " {snakemake.input[0]}"
    " > {snakemake.output.qc} {log}"
)
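A minimal rule for this single-end trimming snippet might look like the following sketch. The paths and adapter string are hypothetical; the output.fastq/output.qc names and the adapters/extra params are taken from the script above.

rule cutadapt_se:
    input:
        "reads/{sample}.fastq.gz",
    output:
        fastq="results/trimmed/{sample}.fastq.gz",
        qc="results/trimmed/{sample}.qc.txt",
    params:
        adapters="-a AGATCGGAAGAGC",  # hypothetical adapter
        extra="-q 20 --minimum-length 33",
    threads: 8
    log:
        "logs/cutadapt/{sample}.log",
    wrapper:
        "v2.6.0/bio/cutadapt/se"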
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2016, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"

from os.path import splitext

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
    raise ValueError("A reference genome has to be provided!")
elif len(snakemake.input) > 1:
    raise ValueError("Only one reference genome can be inputed!")

# Prefix that should be used for the database
prefix = snakemake.params.get("prefix", splitext(snakemake.output.idx[0])[0])

if len(prefix) > 0:
    prefix = "-p " + prefix

# Construction algorithm used to build the index (e.g. bwtsw or is); bwa auto-selects if not given
construction_algorithm = snakemake.params.get("algorithm", "")

if len(construction_algorithm) != 0:
    construction_algorithm = "-a " + construction_algorithm

shell(
    "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}"
)
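A matching rule could look like this sketch; the paths are hypothetical, while the named idx output and the optional algorithm param mirror what the script accesses.

rule bwa_index:
    input:
        "resources/genome.fasta",
    output:
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    params:
        algorithm="bwtsw",  # optional; passed to 'bwa index -a'
    log:
        "logs/bwa_index.log",
    wrapper:
        "v1.17.2/bio/bwa/index"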
__author__ = "Johannes Köster, Julian de Ruiter"
__copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


import tempfile
from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Extract arguments.
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sorting", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")
samtools_opts = get_samtools_opts(snakemake)
java_opts = get_java_opts(snakemake)


index = snakemake.input.idx
if isinstance(index, str):
    index = path.splitext(snakemake.input.idx)[0]
else:
    index = path.splitext(snakemake.input.idx[0])[0]


# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")


if sort_order not in {"coordinate", "queryname"}:
    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))


# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":
    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view {samtools_opts}"

elif sort == "samtools":
    # Add name flag if needed.
    if sort_order == "queryname":
        sort_extra += " -n"

    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort {samtools_opts} {sort_extra} -T {tmpdir}"

elif sort == "picard":
    # Sort alignments using picard SortSam.
    pipe_cmd = "picard SortSam {java_opts} {sort_extra} --INPUT /dev/stdin --TMP_DIR {tmpdir} --SORT_ORDER {sort_order} --OUTPUT {snakemake.output[0]}"

else:
    raise ValueError(f"Unexpected value for params.sort ({sort})")

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(bwa mem"
        " -t {snakemake.threads}"
        " {extra}"
        " {index}"
        " {snakemake.input.reads}"
        " | " + pipe_cmd + ") {log}"
    )
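A hedged paired-end example for this wrapper follows; the paths are placeholders, while the named reads/idx inputs and the sorting/sort_order params match what the script reads.

rule bwa_mem:
    input:
        reads=["results/trimmed/{sample}.1.fastq.gz", "results/trimmed/{sample}.2.fastq.gz"],
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    output:
        "results/mapped/{sample}.bam",
    params:
        sorting="samtools",       # one of: none, samtools, picard
        sort_order="coordinate",  # or queryname
    threads: 8
    log:
        "logs/bwa_mem/{sample}.log",
    wrapper:
        "v1.17.2/bio/bwa/mem"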
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@.
# One thread is used by samtools index itself;
# the value passed to '-@' is the number of *additional* threads.
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
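A corresponding rule can be as simple as this sketch (paths hypothetical):

rule samtools_index:
    input:
        "results/sorted/{sample}.bam",
    output:
        "results/sorted/{sample}.bam.bai",
    threads: 4
    log:
        "logs/samtools_index/{sample}.log",
    wrapper:
        "v1.18.3/bio/samtools/index"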
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts


samtools_opts = get_samtools_opts(snakemake)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


with tempfile.TemporaryDirectory() as tmpdir:
    tmp_prefix = Path(tmpdir) / "samtools_fastq.sort_"

    shell(
        "samtools sort {samtools_opts} {extra} -T {tmp_prefix} {snakemake.input[0]} {log}"
    )
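Analogously, a sketch of a sorting rule (paths hypothetical; the wrapper derives the -o target from the output):

rule samtools_sort:
    input:
        "results/mapped/{sample}.bam",
    output:
        "results/sorted/{sample}.bam",
    threads: 4
    log:
        "logs/samtools_sort/{sample}.log",
    wrapper:
        "v1.18.3/bio/samtools/sort"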
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq1}"
    " -p {snakemake.output.fastq2}"
    " {snakemake.input}"
    " > {snakemake.output.qc} {log}"
)
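A paired-end counterpart to the single-end sketch above might look as follows; paths and adapters are hypothetical, the fastq1/fastq2/qc output names come from the script.

rule cutadapt_pe:
    input:
        ["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
    output:
        fastq1="results/trimmed/{sample}.1.fastq.gz",
        fastq2="results/trimmed/{sample}.2.fastq.gz",
        qc="results/trimmed/{sample}.pe.qc.txt",
    params:
        adapters="-a AGATCGGAAGAGC -A AGATCGGAAGAGC",  # hypothetical adapters
        extra="-q 20 --minimum-length 33",
    threads: 8
    log:
        "logs/cutadapt/{sample}.log",
    wrapper:
        "v2.6.0/bio/cutadapt/pe"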
__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allowing for multiple FASTA files
fasta = snakemake.input.get("fasta")
assert fasta is not None, "input: a FASTA file is required"
fasta = " ".join(fasta) if isinstance(fasta, list) else fasta

shell(
    "kallisto index "  # Tool
    "{extra} "  # Optional parameters
    "--index={snakemake.output.index} "  # Output file
    "{fasta} "  # Input FASTA files
    "{log}"  # Logging
)
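A minimal indexing rule for this snippet (paths hypothetical; the named fasta input and index output follow the script above):

rule kallisto_index:
    input:
        fasta="resources/transcriptome.cdna.fasta",
    output:
        index="results/kallisto/transcripts.idx",
    log:
        "logs/kallisto_index.log",
    wrapper:
        "v1.23.1/bio/kallisto/index"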
__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allowing for multiple FASTQ files
fastq = snakemake.input.get("fastq")
assert fastq is not None, "input: a FASTQ file is required"
fastq = " ".join(fastq) if isinstance(fastq, list) else fastq

shell(
    "kallisto quant "  # Tool
    "{extra} "  # Optional parameters
    "--threads={snakemake.threads} "  # Number of threads
    "--index={snakemake.input.index} "  # Input file
    "--output-dir={snakemake.output} "  # Output directory
    "{fastq} "  # Input FASTQ files
    "{log}"  # Logging
)
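A quantification rule might look like this sketch. Paths are placeholders, the output is a directory as required by the --output-dir call above, and the bootstrap count is a hypothetical extra parameter (sleuth downstream needs bootstraps).

rule kallisto_quant:
    input:
        fastq=["results/trimmed/{sample}.1.fastq.gz", "results/trimmed/{sample}.2.fastq.gz"],
        index="results/kallisto/transcripts.idx",
    output:
        directory("results/kallisto/{sample}"),
    params:
        extra="--bootstrap-samples=100",  # hypothetical value
    threads: 8
    log:
        "logs/kallisto_quant/{sample}.log",
    wrapper:
        "v1.23.1/bio/kallisto/quant"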
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = []
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
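As with the annotation script above, a hedged sketch of a driving rule; the values are placeholders, while the params names and the datatype choices come from the script.

rule get_transcriptome:
    output:
        "resources/transcriptome.cdna.fasta",
    params:
        species="homo_sapiens",  # placeholder values
        release="100",
        build="GRCh38",
        datatype="cdna",         # one of: dna, cdna, cds, ncrna, pep
    log:
        "logs/get_transcriptome.log",
    wrapper:
        "v1.7.1/bio/reference/ensembl-sequence"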
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2017, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")

shell("datavzrd {snakemake.input.config} {extra} --output {snakemake.output[0]} {log}")
wrapper:
    "v2.6.0/utils/datavzrd"

wrapper:
    "v2.6.0/utils/datavzrd"

wrapper:
    "v2.6.0/utils/datavzrd"

script:
    "../scripts/compose-sample-sheet.py"

script:
    "../scripts/sleuth-init.R"

script:
    "../scripts/sleuth-diffexp.R"

script:
    "../scripts/ihw-fdr-control.R"

script:
    "../scripts/plot-bootstrap.R"

script:
    "../scripts/plot-pca.R"

script:
    "../scripts/plot-diffexp-pval-hist.R"

script:
    "../scripts/sleuth-to-matrix.R"

script:
    "../scripts/plot_diffexp_heatmap.R"

script:
    "../scripts/plot-group-density.R"

script:
    "../scripts/plot-scatter.R"

script:
    "../scripts/plot-fld.R"

script:
    "../scripts/plot-variances.R"

script:
    "../scripts/vega_plot_volcano.py"

script:
    "../scripts/isoform-switch-analysis-init.R"

shell:
    "pfam_scan.pl -fasta {input.fasta} -dir {params.pfam_dir} > {output} 2> {log}"

shell:
    "cpat.py -g {input.fasta} -d {input.cpat_model} -x {input.hexamers} -o {output} 2> {log}"

script:
    "../scripts/isoform-switch-analysis-annotate.R"

script:
    "../scripts/spia.R"

script:
    "../scripts/fgsea.R"

script:
    "../scripts/plot-fgsea-gene-sets.R"

script:
    "../scripts/ens_gene_to_go.R"

shell:
    "( curl --silent -o {output} {params.download} ) 2> {log}"

script:
    "../scripts/goatools-go-enrichment-analysis.py"

script:
    "../scripts/get-sample-hist-bins.py"

script:
    "../scripts/plot-3prime-qc-histogram.py"

script:
    "../scripts/plot-3prime-qc-histogram.py"

shell:
    "samtools view {input.bam_file}/pseudoalignments.bam | cut -f1,3,4,10,11 > {output} 2> {log}"

wrapper:
    "v1.23.1/bio/kallisto/index"

wrapper:
    "v1.23.1/bio/kallisto/quant"

wrapper:
    "v1.17.2/bio/bwa/index"

wrapper:
    "v1.17.2/bio/bwa/mem"

shell:
    "samtools view -h -F 4 {input.mapped_bam} | cut -f1-12 | grep -f {input.canonical_ids} | samtools view -o {output.canonical_mapped_bam} 2> {log}"

shell:
    "samtools view {input.canonical_mapped_bam} | cut -f1,3,4,10,11 > {output} 2> {log}"

wrapper:
    "v1.18.3/bio/samtools/sort"

wrapper:
    "v1.18.3/bio/samtools/index"

script:
    "../scripts/get-3prime-max-positions.py"

shell:
    "samtools view -R {input.canonical_mapped_3prime_pos} {input.canonical_mapped_bam} -o {output.canonical_mapped_3prime_bam} 2> {log}"

shell:
    "samtools bam2fq {input.canonical_3prime_mapped_bam} > {output.canonical_fastq} 2> {log}"

shell:
    "samtools sort {input}/pseudoalignments.bam > {output} 2> {log}"

wrapper:
    "v1.18.3/bio/samtools/index"

wrapper:
    "v1.23.1/bio/kallisto/index"

wrapper:
    "v1.23.1/bio/kallisto/quant"

script:
    "../scripts/remove_poly_tails.py"

script:
    "../scripts/remove_strand_info.py"

script:
    "../scripts/get_canonical_ids.R"

shell:
    """bioawk -cfastx \
    'BEGIN{{while((getline k <"{input.canonical_ids}")>0)i[k]=1}} \
    {{if(i[$name])print ">"$name"\\n"$seq}}' \
    {input.fasta} > {output}"""

wrapper:
    "v1.7.1/bio/reference/ensembl-sequence"
From line 14 of rules/ref.smk

wrapper:
    "0.80.1/bio/reference/ensembl-annotation"
From line 29 of rules/ref.smk

script:
    "../scripts/get-transcript-info.R"
From line 45 of rules/ref.smk

shell:
    "(curl -L ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/"
    "Pfam{params.release}/Pfam-A.{wildcards.ext}.gz | "
    "gzip -d > {output}) 2> {log}"
From line 56 of rules/ref.smk

shell:
    "hmmpress {input} > {log} 2>&1"
From line 72 of rules/ref.smk

shell:
    "make_hexamer_tab.py --cod={input.cds} --noncod={input.ncrna} > {output} 2> {log}"
From line 87 of rules/ref.smk

shell:
    "make_logitModel.py --hex={input.hexamers} --cgene={input.cds} "
    "--ngene={input.ncrna} -o {params.prefix} 2> {log}"
From line 105 of rules/ref.smk

script:
    "../scripts/get-spia-db.R"
From line 124 of rules/ref.smk

wrapper:
    "v2.6.0/bio/cutadapt/pe"
From line 14 of rules/trim.smk

wrapper:
    "v2.6.0/bio/cutadapt/se"
From line 30 of rules/trim.smk

script:
    "../scripts/get-max-read-length.py"
From line 43 of rules/trim.smk
import sys
sys.stderr = open(snakemake.log[0], "w")

samples_ = snakemake.params.units[["sample", "unit"]].merge(snakemake.params.samples, on="sample")
samples_["sample"] = samples_.apply(
    lambda row: "{}-{}".format(row["sample"], row["unit"]), axis=1
)
samples_["path"] = list(snakemake.input.kallisto_output)
del samples_["unit"]
samples_.to_csv(snakemake.output[0], sep="\t", index=False)
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides `tidyverse` and load_bioconductor_package()
source(snakemake@params[["common_src"]])

ens_gene_to_go <-
    AnnotationDbi::select(get(snakemake@params[["bioc_species_pkg"]]),
        keys = keys(get(snakemake@params[["bioc_species_pkg"]]),
            keytype = "ENSEMBL"
        ), columns = c("GO"),
        keytype = "ENSEMBL"
    ) %>%
    # rows with empty ENSEMBL or GO IDs are useless and will lead to trouble below
    drop_na(ENSEMBL, GO) %>%
    dplyr::rename(ensembl_gene_id = ENSEMBL) %>%
    group_by(ensembl_gene_id) %>%
    summarise(go_ids = str_c(GO, collapse = ";"))


write_tsv(ens_gene_to_go, file = snakemake@output[[1]], col_names = FALSE)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("fgsea")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
                  drop_na(ext_gene) %>%
                  mutate(ext_gene = str_to_upper(ext_gene)) %>%
                  # resolve multiple occurrences of the same ext_gene, usually
                  # meaning that several Ensembl genes map to the same gene
                  # symbol -- so we may lose resolution here, as long as gene
                  # symbols are used
                  group_by(ext_gene) %>%
                    filter( qval == min(qval, na.rm = TRUE) ) %>%
                    filter( pval == min(pval, na.rm = TRUE) ) %>%
                    # for the case of min(qval) == 1 and min(pval) == 1, we
                    # need something else to select a unique entry per gene
                    filter( mean_obs == max(mean_obs, na.rm = TRUE) ) %>%
                    mutate(target_id = str_c(target_id, collapse=",")) %>%
                    mutate(ens_gene = str_c(ens_gene, collapse=",")) %>%
                  distinct()

signed_pi <- get_prefix_col("signed_pi_value", colnames(diffexp))

ranked_genes <- diffexp %>%
                  dplyr::select(ext_gene, !!signed_pi) %>%
                  deframe()

# get and write out rank values that are tied -- a way to check up on respective warnings
rank_ties <- enframe(ranked_genes) %>%
               mutate(dup = duplicated(value) | duplicated(value, fromLast = TRUE) ) %>%
               filter(dup == TRUE) %>%
               dplyr::select(ext_gene = name, !!signed_pi := value)
write_tsv(rank_ties, snakemake@output[["rank_ties"]])

print(gene_sets)
print(ranked_genes)

fgsea_res <- fgsea(pathways = gene_sets,
                    stats = ranked_genes,
                    minSize=10,
                    maxSize=700,
                    nproc=snakemake@threads,
                    eps=snakemake@params[["eps"]]
                    ) %>%
                as_tibble()

# Annotation is impossible without any entries, so then just write out empty files
if ( (fgsea_res %>% count() %>% pull(n)) == 0 ) {

    # leadingEdge cannot be a list, and the empty output should at least
    # have the correct columns in the correct order
    unnested <- fgsea_res %>%
                    unnest(leadingEdge) %>%
                    add_column(
                        leading_edge_symbol = NA,
                        leading_edge_ens_gene = NA,
                        leading_edge_entrez_id = NA
                    ) %>%
                    dplyr::relocate(
                        c(
                          leading_edge_symbol,
                          leading_edge_ens_gene,
                          leading_edge_entrez_id,
                          leadingEdge
                        ),
                        .after = last_col()
                    )

    write_tsv(unnested, file = snakemake@output[["enrichment"]])
    write_tsv(unnested, file = snakemake@output[["significant"]])

} else {

    # add further annotation
    annotated <- fgsea_res %>%
                    unnest(leadingEdge) %>%
                    mutate(
                        leading_edge_symbol = str_to_title(leadingEdge),
                        leading_edge_entrez_id = mapIds(leading_edge_symbol, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENTREZID"),
                        leading_edge_ens_gene = mapIds(leading_edge_symbol, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENSEMBL")
                        ) %>%
                    group_by(pathway) %>%
                    summarise(
                        leadingEdge = str_c(leadingEdge, collapse = ','),
                        leading_edge_symbol = str_c(leading_edge_symbol, collapse = ','),
                        leading_edge_entrez_id = str_c(leading_edge_entrez_id, collapse = ','),
                        leading_edge_ens_gene = str_c(leading_edge_ens_gene, collapse = ',')
                    ) %>%
                    inner_join(fgsea_res %>% dplyr::select(-leadingEdge), by = "pathway") %>%
                    dplyr::relocate(
                        c(
                          leading_edge_symbol,
                          leading_edge_ens_gene,
                          leading_edge_entrez_id,
                          leadingEdge
                        ),
                        .after = last_col()
                    )

    # write out fgsea results for all gene sets
    write_tsv(annotated, file = snakemake@output[["enrichment"]])

    # select significant pathways
    sig_gene_sets <- annotated %>%
                       filter( padj < snakemake@params[["gene_set_fdr"]] )

    # write out fgsea results for gene sets found to be significant
    write_tsv(sig_gene_sets, file = snakemake@output[["significant"]])
}

# select significant pathways
top_pathways <- fgsea_res %>% arrange(padj) %>% head(n=1000) %>% filter(padj <= snakemake@params[["gene_set_fdr"]]) %>% arrange(-NES) %>% pull(pathway)
selected_gene_sets <- gene_sets[top_pathways]

height = .7 * (length(selected_gene_sets) + 2)

# table plot of all gene sets
tg <- plotGseaTable(
            pathways = selected_gene_sets,
            stats = ranked_genes,
            fgseaRes = fgsea_res,
            gseaParam = 1,
            render = FALSE
        )
ggsave(filename = snakemake@output[["plot"]], plot = tg, width = 15, height = height, limitsize=FALSE)

collapsed_pathways <- collapsePathways(fgsea_res %>% arrange(pval) %>% filter(padj <= snakemake@params[["gene_set_fdr"]]), gene_sets, ranked_genes)
main_pathways <- fgsea_res %>% filter(pathway %in% collapsed_pathways$mainPathways) %>% arrange(-NES) %>% pull(pathway)
selected_gene_sets <- gene_sets[main_pathways]
height = .7 * (length(selected_gene_sets) + 2)

# table plot of all gene sets
tg <- plotGseaTable(
            pathways = selected_gene_sets,
            stats = ranked_genes,
            fgseaRes = fgsea_res,
            gseaParam = 1,
            render = FALSE
        )
ggsave(filename = snakemake@output[["plot_collapsed"]], plot = tg, width = 15, height = height, limitsize=FALSE)
import pandas as pd
import numpy as np
import pysam
import sys

sys.stderr = open(snakemake.log[0], "w")

bam_file = snakemake.input["canonical_mapped_bam"]
sample_name = bam_file.split(".canonical.mapped.sorted.bam")[0]
# Bam file reading
bam_file = pysam.AlignmentFile(snakemake.input["canonical_mapped_bam"], "rb")
bam_header = bam_file.header.to_dict()
trans_length_data = pd.DataFrame(bam_header.get("SQ"))
trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)

# Aligned text file reading
align_bam_txt = pd.read_csv(
    snakemake.input["canonical_mapped_pos"],
    sep="\t",
    names=["read_name", "Transcript_ID", "Start", "read", "Quality"],
)
align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]
merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")

# reads aligned to forward strand
forward_strand = merge_data.loc[merge_data["Strand"] == "1"]
forward_strand[sample_name + "_forward_strand"] = (
    forward_strand["LN"] - forward_strand["Start"]
)
aligned_reads = forward_strand.loc[
    forward_strand.groupby(["read_name", "read"])[
        sample_name + "_forward_strand"
    ].idxmin()
]

# reads aligned to reverse strand
reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]
read_min = reverse_strand.loc[
    reverse_strand.groupby(["read_name", "read"])["Start"].idxmin()
]
aligned_reads = pd.concat([aligned_reads, read_min])
aligned_reads.to_csv(
    snakemake.output["canonical_mapped_3prime_pos"],
    columns=["read_name"],
    sep="\t",
    index=False,
)
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


library(biomaRt)
library(dplyr)
library(tidyverse)

ensembl <- useEnsembl(
    biomart = "genes", dataset = "hsapiens_gene_ensembl",
    version = snakemake@params[["release"]]
)
get_transcripts_ids <-
    getBM(
        attributes = c(
            "ensembl_transcript_id_version",
            "transcript_is_canonical", "transcript_mane_select", "chromosome_name"
        ),
        mart = ensembl
    )
# transcript_mane_select: the manually curated primary transcript, as encoded by NCBI (MANE select)
canonical_ids <- get_transcripts_ids %>%
    select(
        ensembl_transcript_id_version, transcript_is_canonical,
        transcript_mane_select, chromosome_name
    ) %>%
    filter(!str_detect(chromosome_name, "patch|PATCH")) %>%
    # str_detect(x, "") is TRUE for every non-missing value, i.e. keep
    # transcripts that carry a MANE select annotation
    filter(str_detect(transcript_mane_select, "")) %>%
    subset(transcript_is_canonical == 1)
write.csv(canonical_ids$ensembl_transcript_id_version,
    file = snakemake@output[[1]], quote = FALSE, row.names = FALSE
)
import json
import pysam

max_read_length = max(
    len(rec.sequence) for f in snakemake.input for rec in pysam.FastxFile(f)
)

with open(snakemake.output[0], "w") as out:
    json.dump(max_read_length, out)
import pandas as pd
import numpy as np
import pysam
from scipy import stats
import sys
import json
import csv


sys.stderr = open(snakemake.log[0], "w")

# Get the read length
with open(snakemake.input["read_length"]) as f:
    read_length = json.load(f)

# Transcript ID to process individually, or "all" to process all transcripts
transcript_ids = snakemake.params["each_transcript"]

# Define data-frame for each strand
fwrd_allsamp_hist = pd.DataFrame([])
fwrd_allsamp_hist_trim = pd.DataFrame([])
fwrd_allsamp_hist_fil = pd.DataFrame([])
rev_allsamp_hist = pd.DataFrame([])
rev_allsamp_hist_trim = pd.DataFrame([])
rev_allsamp_hist_fil = pd.DataFrame([])

# Get the sample names
sample_name = snakemake.params["samples"]

# Bam file reading
bam_file = pysam.AlignmentFile(snakemake.input["samtools_sort"], "rb")
bam_header = bam_file.header.to_dict()
trans_length_data = pd.DataFrame(bam_header.get("SQ"))
trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)

# Aligned text file reading
align_bam_txt = pd.read_csv(
    snakemake.input["aligned_file"],
    sep="\t",
    names=["read_Name", "Transcript_ID", "Start", "reads", "Quality"],
)
align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]


# Both transcript len and start postion are merged based on same transcript ID
merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")

# Forward strand
forward_strand = merge_data.loc[merge_data["Strand"] == "1"]

# Each read's position relative to the transcript 3' end is calculated
forward_strand[sample_name + "_forward_strand"] = (
    forward_strand["LN"] - forward_strand["Start"]
)
aligned_reads = forward_strand.loc[
    forward_strand.groupby(["read_Name", "reads"])[
        sample_name + "_forward_strand"
    ].idxmin()
]
if aligned_reads["Transcript"].str.contains(transcript_ids).any():
    # Get aligned read positions for the given transcript
    fwrd_filtered_transcript_data = aligned_reads.query("Transcript == @transcript_ids")
    Freq_fwrd_fil, bins_fwrd_fil = np.histogram(
        fwrd_filtered_transcript_data[sample_name + "_forward_strand"],
        bins=read_length,
        range=[0, max(fwrd_filtered_transcript_data["LN"])],
    )
    hist_fwrd_fil = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_forward": Freq_fwrd_fil,
            "bins_foward": bins_fwrd_fil[:-1],
        }
    )
    fwrd_allsamp_hist_fil = pd.concat([fwrd_allsamp_hist_fil, hist_fwrd_fil])
elif transcript_ids == "all":
    # Values added to corresponding bins
    Freq_fwrd, bins_fwrd = np.histogram(
        aligned_reads[sample_name + "_forward_strand"],
        bins=read_length,
        range=[0, max(aligned_reads["LN"])],
    )
    Freq_fwrd_trim, bins_fwrd_trim = np.histogram(
        forward_strand[sample_name + "_forward_strand"],
        bins=read_length,
        range=[0, 20000],
    )
    # Dataframe created for bins
    hist_fwrd = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_forward": Freq_fwrd,
            "bins_foward": bins_fwrd[:-1],
        }
    )
    hist_fwrd_trim = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_forward": Freq_fwrd_trim,
            "bins_foward": bins_fwrd_trim[:-1],
        }
    )
    # The per-sample dataframes are concatenated
    fwrd_allsamp_hist = pd.concat([fwrd_allsamp_hist, hist_fwrd])
    fwrd_allsamp_hist_trim = pd.concat([fwrd_allsamp_hist_trim, hist_fwrd_trim])
    fwrd_allsamp_hist.to_csv(
        snakemake.output["fwrd_allsamp_hist"], sep="\t", index=False
    )
    fwrd_allsamp_hist_trim.to_csv(
        snakemake.output["fwrd_allsamp_hist_trim"], sep="\t", index=False
    )

# Reverse strand
reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]

# The minimum aligned start position is taken
read_min = reverse_strand.loc[
    reverse_strand.groupby(["read_Name", "reads"])["Start"].idxmin()
]

if read_min["Transcript"].str.contains(transcript_ids).any():
    # Get aligned read positions for the given transcript
    rev_filtered_transcript_data = read_min.query("Transcript == @transcript_ids")
    Freq_rev_fil, bins_rev_fil = np.histogram(
        rev_filtered_transcript_data["Start"],
        bins=read_length,
        range=[0, max(rev_filtered_transcript_data["LN"])],
    )
    hist_rev_fil = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_rev": Freq_rev_fil,
            "bins_rev": bins_rev_fil[:-1],
        }
    )
    rev_allsamp_hist_fil = pd.concat([rev_allsamp_hist_fil, hist_rev_fil])

elif transcript_ids == "all":
    # Values added to corresponding bins
    Freq_rev, bins_rev = np.histogram(
        read_min["Start"], bins=read_length, range=[0, max(read_min["LN"])]
    )
    Freq_rev_trim, bins_rev_trim = np.histogram(
        read_min["Start"], bins=read_length, range=[0, 20000]
    )

    # Dataframe created for bins
    hist_rev = pd.DataFrame(
        {"sample_Name": sample_name, "Freq_rev": Freq_rev, "bins_rev": bins_rev[:-1]}
    )
    hist_rev_trim = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_rev": Freq_rev_trim,
            "bins_rev": bins_rev_trim[:-1],
        }
    )

    # The per-sample dataframes are concatenated
    rev_allsamp_hist = pd.concat([rev_allsamp_hist, hist_rev])
    rev_allsamp_hist_trim = pd.concat([rev_allsamp_hist_trim, hist_rev_trim])
    rev_allsamp_hist.to_csv(snakemake.output["rev_allsamp_hist"], sep="\t", index=False)
    rev_allsamp_hist_trim.to_csv(
        snakemake.output["rev_allsamp_hist_trim"], sep="\t", index=False
    )


# Write bins to each file
if transcript_ids != "all":
    if fwrd_allsamp_hist_fil.empty:
        with open(snakemake.output["fwrd_allsamp_hist_fil"], "w") as fwrd_csvfile:
            print("no reads aligned", file=fwrd_csvfile)
    else:
        with open(snakemake.output["fwrd_allsamp_hist_fil"], "w") as fwrd_csvfile:
            fwrd_allsamp_hist_fil.to_csv(fwrd_csvfile, sep="\t", index=False)
    if rev_allsamp_hist_fil.empty:
        with open(snakemake.output["rev_allsamp_hist_fil"], "w") as rev_csvfile:
            print("no reads aligned", file=rev_csvfile)
    else:
        with open(snakemake.output["rev_allsamp_hist_fil"], "w") as rev_csvfile:
            rev_allsamp_hist_fil.to_csv(rev_csvfile, sep="\t", index=False)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("SPIA")
library("graphite")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

pw_db <- snakemake@params[["pathway_db"]]

db <- pathways(snakemake@params[["species"]], pw_db)
db <- convertIdentifiers(db, "ENSEMBL")

saveRDS(db, snakemake@output[[1]])
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("biomaRt")
library("tidyverse")
library("dplyr")

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while (class(mart)[[1]] != "Mart") {
  mart <- tryCatch(
    {
      # done here, because error function does not
      # modify outer scope variables, I tried
      if (mart == "www") rounds <- rounds + 1
      # equivalent to useMart, but you can choose
      # the mirror instead of specifying a host
      biomaRt::useEnsembl(
        biomart = "ENSEMBL_MART_ENSEMBL",
        dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
        version = snakemake@params[["version"]],
        mirror = mart
      )
    },
    error = function(e) {
      # change or make configurable if you want more or
      # less rounds of tries of all the mirrors
      if (rounds >= 3) {
        stop(
          str_c(
            "Have tried all 4 available Ensembl biomaRt mirrors ",
            rounds,
            " times. You might have a connection problem, or no mirror is responsive."
          )
        )
      }
      # hop to next mirror
      mart <- switch(mart,
        useast = "uswest",
        uswest = "asia",
        asia = "www",
        www = {
          # wait before starting another round through the mirrors,
          # hoping that intermittent problems disappear
          Sys.sleep(30)
          "useast"
        }
      )
    }
  )
}
three_prime_activated <- snakemake@params[["three_prime_activated"]]

attributes <- c("ensembl_transcript_id",
                "ensembl_gene_id",
                "external_gene_name",
                "description")
has_canonical <-
  "transcript_is_canonical" %in% biomaRt::listAttributes(mart = mart)$name
has_mane_select <-
  "transcript_mane_select" %in% biomaRt::listAttributes(mart = mart)$name
# request canonical / MANE select attributes only if biomart provides them,
# and request the 3'-specific columns only when three-prime mode is active
if (has_canonical && three_prime_activated) {
  attributes <- c(attributes, "transcript_is_canonical", "chromosome_name",
    "transcript_mane_select", "ensembl_transcript_id_version")
} else if (has_canonical) {
  attributes <- c(attributes, "transcript_is_canonical")
}
t2g <- biomaRt::getBM(
attributes = attributes,
mart = mart,
useCache = FALSE
)
# Add NA columns for attributes that were not requested above, so that
# downstream code can rely on all columns being present
if (!has_canonical) {
  t2g <- t2g %>% add_column(transcript_is_canonical = NA)
}
if (!has_canonical || !three_prime_activated) {
  t2g <- t2g %>% add_column(chromosome_name = NA, transcript_mane_select = NA,
      ensembl_transcript_id_version = NA)
}
t2g <- t2g %>%
  rename(
    target_id = ensembl_transcript_id,
    ens_gene = ensembl_gene_id,
    ext_gene = external_gene_name,
    gene_desc = description,
    canonical = transcript_is_canonical
  ) %>%
  mutate_at(
    vars(gene_desc),
    function(values) {
      str_trim(map(values, function(v) {
        str_split(v, r"{\[}")[[1]][1]
      }))
    } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5])
  ) %>%
  mutate_at(
    vars(canonical),
    function(values) {
      as_vector(
        map(
          str_trim(values),
          function(v) {
            if (is.na(v)) {
              NA
            } else if (v == "1") {
              TRUE
            } else if (v == "0") {
              FALSE
            }
          }
        )
      )
    }
  )
# If 3-prime-rna-seq mode is activated, keep only MANE select transcripts and drop chromosomes defined as "patch"
if (three_prime_activated) {
  if (has_mane_select) {
    t2g <- t2g %>%
    filter(!str_detect(chromosome_name, "patch|PATCH")) %>%
    filter(str_detect(transcript_mane_select, ""))
  } else {
    stop(
      "the transcript_mane_select attribute is required in biomart when three-prime mode is activated"
    )
  }
}
write_rds(t2g, file = snakemake@output[[1]], compress = "gz")
import sys

sys.stderr = open(snakemake.log[0], "w")
sys.stdout = open(snakemake.log[0], "a")

import pandas as pd
import matplotlib.pyplot as plt
from goatools.obo_parser import GODag
from goatools.anno.idtogos_reader import IdToGosReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_results  # , plot_goid2goobj, plot_gos
# read in directed acyclic graph of GO terms / IDs
obodag = GODag(snakemake.input.obo)


# read in mapping gene ids from input to GO terms / IDs
objanno = IdToGosReader(snakemake.input.ens_gene_to_go, godag=obodag)


# extract namespace(?) -> id2gos mapping
ns2assoc = objanno.get_ns2assc()

for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))

# read gene diffexp table
all_genes = pd.read_table(snakemake.input.diffexp)
# select genes significantly differentially expressed according to BH FDR of sleuth
fdr_level_gene = float(snakemake.params.gene_fdr)
sig_genes = all_genes[all_genes["qval"] < fdr_level_gene]

# initialize GOEA object
fdr_level_go_term = float(snakemake.params.go_term_fdr)
model = snakemake.params.model

goeaobj = GOEnrichmentStudyNS(
    # list of 'population' of genes looked at in total
    pop=all_genes["ens_gene"].tolist(),
    # geneid -> GO ID mapping
    ns2assoc=ns2assoc,
    # ontology DAG
    godag=obodag,
    propagate_counts=False,
    # multiple testing correction method (fdr_bh is false discovery rate control with Benjamini-Hochberg)
    methods=["fdr_bh"],
    # significance cutoff for method named above
    alpha=fdr_level_go_term,
)

goea_results_all = goeaobj.run_study(sig_genes["ens_gene"].tolist())
ensembl_id_to_symbol = dict(zip(all_genes["ens_gene"], all_genes["ext_gene"]))

go_items = [
    val
    for cat in ["BP", "CC", "MF"]
    for item, val in goeaobj.ns2objgoea[cat].assoc.items()
]

# goea_results_all.rename(columns={'# GO': 'GO'})


if goea_results_all:
    go_terms = pd.DataFrame(
        list(
            map(
                lambda x: [
                    x.GO,
                    x.goterm.name,
                    x.goterm.namespace,
                    x.p_uncorrected,
                    x.p_fdr_bh,
                    x.ratio_in_study,
                    x.ratio_in_pop,
                    x.depth,
                    x.study_count,
                    x.study_items,
                ],
                goea_results_all,
            )
        ),
        columns=[
            "GO",
            "term",
            "class",
            "p_uncorrected",
            "p_fdr_bh",
            "ratio_in_study",
            "ratio_in_pop",
            "depth",
            "study_count",
            "study_items",
        ],
    )
    go_terms["study_items"] = go_terms["study_items"].str.join(",")
    go_terms["study_items"] = go_terms.study_items.str.replace(
        "\w+(?=,|$)", lambda m: ensembl_id_to_symbol.get(m.group(0))
    )
    go_terms.to_csv(snakemake.output.enrichment, sep="\t", index=False)
else:
    # write empty table to indicate that nothing was found
    with open(snakemake.output.enrichment, "w") as out:
        print(
            "GO",
            "term",
            "class",
            "name",
            "p_uncorrected",
            "p_fdr_bh",
            "ratio_in_study",
            "ratio_in_pop",
            "depth",
            "study_count",
            "study_items",
            sep="\t",
            file=out,
        )


# plot results

# from first plot output file name, create generic file name to trigger
# separate plots for each of the gene ontology name spaces
outplot_generic = (
    snakemake.output.plot[0]
    .replace("_BP.", "_{NS}.", 1)
    .replace("_CC.", "_{NS}.", 1)
    .replace("_MF.", "_{NS}.", 1)
)

goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < fdr_level_go_term]

# https://github.com/mousepixels/sanbomics_scripts/blob/main/GO_in_python.ipynb
if goea_results_sig:
    go_sig_terms = pd.DataFrame(
        list(
            map(
                lambda x: [
                    x.GO,
                    x.goterm.name,
                    x.goterm.namespace,
                    x.p_uncorrected,
                    x.p_fdr_bh,
                    x.ratio_in_study[0],
                    x.ratio_in_study[1],
                    x.ratio_in_pop[0],
                    x.study_items,
                ],
                goea_results_sig,
            )
        ),
        columns=[
            "GO",
            "term",
            "class",
            "p-value",
            "p_corrected",
            "n_genes_diff_exp",
            "n_genes_diff_exp_study",
            "n_go_genes",
            "study_items",
        ],
    )
    go_sig_terms["gene_ratio"] = go_sig_terms.n_genes_diff_exp / go_sig_terms.n_go_genes
    go_sig_terms_sorted = go_sig_terms.sort_values(by=["class", "p_corrected"])
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted["study_items"].str.join(
        ","
    )
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.str.replace(
        "\w+(?=,|$)", lambda m: ensembl_id_to_symbol.get(m.group(0))
    )
    # Append fold change values to gene names
    gene_to_fold_change = dict(
        zip(sig_genes["ext_gene"], sig_genes.filter(regex=("b_" + model)).iloc[:, 0])
    )
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.astype("str")
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.str.split(",")
    # As go terms contains 0 genes associated with terms, assign empty key to dict fold change as 0
    gene_to_fold_change[""] = 0
    gene_to_fold_change["nan"] = 0
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.map(
        lambda x: ", ".join(
            [f"{gene_symbol}:{gene_to_fold_change[gene_symbol]}" for gene_symbol in x]
        )
    )
    go_sig_terms_sorted.to_csv(
        snakemake.output.enrichment_sig_terms, sep="\t", index=False
    )
else:
    # write empty table to indicate that nothing was found
    with open(snakemake.output.enrichment_sig_terms, "w") as out:
        print(
            "GO",
            "term",
            "class",
            "p-value",
            "p_corrected",
            "n_genes_diff_exp",
            "n_genes_diff_exp_study",
            "n_go_genes",
            "gene_ratio",
            "study_items",
            sep="\t",
            file=out,
        )
plot_results(
    outplot_generic,
    # use pvals for coloring
    goea_results=goea_results_sig,
    # print general gene symbol instead of Ensembl ID
    id2symbol=ensembl_id_to_symbol,
    # number of genes to print, or True for printing all (including count of genes)
    study_items=None,
    # number of genes to print per line
    items_p_line=6,
    # p-values to use for coloring of GO term nodes (argument name determined from code and testing against value "p_uncorrected")
    pval_name="p_fdr_bh",
)

# for all name spaces
for ns in ns2assoc.keys():
    # check if no GO terms were found to be significant
    if len([r for r in goea_results_sig if r.NS == ns]) == 0:
        fig = plt.figure(figsize=(12, 2))
        text = fig.text(
            0.5,
            0.5,
            "No plot generated, because no GO terms were found significant\n"
            "for name space {} and significance levels: genes ({}), GO terms ({}).\n"
            "You might want to check those levels and/or your intermediate data.".format(
                ns, fdr_level_gene, fdr_level_go_term
            ),
            ha="center",
            va="center",
            size=20,
        )
        fig.savefig(outplot_generic.replace("_{NS}.", "_{}.".format(ns)))
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("tidyverse")
library("ggpubr")
library("IHW")

number_of_groups <- 7

gene_data <- read_tsv(snakemake@input[[1]])
level_name <- snakemake@wildcards[["level"]]

# calculate covariate mean_obs for gene.aggregated data
if(level_name == "genes-aggregated") {
  gene_data <- gene_data %>%
    mutate(mean_obs = if_else(num_aggregated_transcripts > 0, sum_mean_obs_counts / num_aggregated_transcripts, 0), ens_gene = target_id)
}

# determine the appropriate number of groups for grouping in ihw calculation
tested_number_of_groups <- number_of_groups
ihw_test_grouping <- function(x){
  tryCatch(
    expr = {
      ihw(pval ~ mean_obs, data = gene_data, alpha = 0.1, nbins = x)
      return(TRUE)
    },
    error = function(e) {
      print(str_c("Number of groups was set too hight, trying grouping by ", tested_number_of_groups - 1, " groups."))
      return(FALSE)
    })
}

while(!ihw_test_grouping(tested_number_of_groups) && tested_number_of_groups > 0){
  tested_number_of_groups <- tested_number_of_groups - 1
}

if (tested_number_of_groups < number_of_groups) {
  print(str_c("The chosen number of groups for ", level_name, " dataset was too large, instead IHW was calculated on ", tested_number_of_groups, " groups."))
  number_of_groups <- tested_number_of_groups
}

gene_data <- gene_data %>%
  drop_na(pval, mean_obs) %>%
  select(ens_gene, ext_gene, pval, mean_obs) %>%
  mutate(grouping = groups_by_filter(mean_obs, number_of_groups))

### diagnostic plots for covariate and grouping
#dispersion plot
dispersion <- ggplot(gene_data, aes(x = percent_rank(mean_obs), y = -log10(pval))) +
  geom_point() +
  ggtitle("IHW: scatter plot of p-values vs. mean of counts") +
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5)) +
  xlab("percent rank of mean_obs") +
  ylab(expression(-log[10]("p-value")))

ggsave(snakemake@output[["dispersion"]], dispersion)

#histograms
hist_overall <- ggplot(gene_data, aes(x = pval)) +
  geom_histogram(binwidth = 0.025, boundary = 0) +
  xlab("p-values without grouping") +
  ylab("density")

hist_groups <- ggplot(gene_data, aes(x = pval)) +
  geom_histogram(binwidth = 0.025, boundary = 0) +
  xlab("p-values of the individual groups") +
  ylab("density") +
  facet_wrap(~ grouping, nrow = ceiling(number_of_groups / 4)) 

plots_agg_mean_obs <- ggarrange(hist_overall, hist_groups, nrow = 1)

histograms <- annotate_figure(plots_agg_mean_obs,
                              top = text_grob("IHW: histograms for p-values of mean of counts",
                                              color = "black",
                                              face = "bold",
                                              size = 12))

ggexport(histograms,
         filename = snakemake@output[["histograms"]],
         width=14)

# ihw calculation
ihw_results_mean <- ihw(pval ~ mean_obs, data = gene_data, alpha = 0.1, nbins = tested_number_of_groups)

# merging ens_gene-IDs and ext_gene-names
ihw_mean <- as.data.frame(ihw_results_mean) %>% 
  # TODO remove ugly hack if ihw in future allows annotation columns (feature requested here: https://support.bioconductor.org/p/129972/)
  right_join(gene_data, by = c(pvalue = "pval", covariate = "mean_obs", group = "grouping")) %>%
  unique() %>%
  select(ens_gene, ext_gene, everything())

write_tsv(ihw_mean, snakemake@output[["results"]])

### diagnostic plots for ihw calculation
# plot of trends of the covariate
plot_trend_mean <- plot(ihw_results_mean) +
  ggtitle("IHW: Plot of trends of mean of counts") +
  theme(plot.title = element_text(size=12))
ggsave(snakemake@output[["trends"]], plot_trend_mean)

# plots of decision boundaries for unweighted p-values as a function of the covariate
plot_decision_mean <- plot(ihw_results_mean, what = "decisionboundary") +
  ggtitle("IHW: Decision boundaries for unweighted p-values vs. mean of counts") +
  theme(plot.title = element_text(size=12))
ggsave(snakemake@output[["decision"]], plot_decision_mean)

# p-values vs. adjusted p-values
plot_ihw_pval_mean <- ggplot(ihw_mean, aes(x = pvalue, y = adj_pvalue, col = group)) +
  geom_point(size = 0.25) +
  ggtitle("IHW: raw p-values vs. adjusted p-values from ihw-analysis") +
  theme(plot.title = element_text(size=12)) +
  scale_colour_hue(l = 70, c = 150, drop = FALSE)
ggsave(snakemake@output[["adj_pvals"]], plot_ihw_pval_mean)
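For orientation: the call above lets IHW learn per-group weights from the mean_obs covariate and then applies a weighted Benjamini-Hochberg procedure. A minimal, self-contained sketch on simulated data (the toy data frame and its columns are illustrative only, not part of the workflow):

library("IHW")

set.seed(42)
toy <- data.frame(
  pval = runif(1000),      # p-values to adjust
  mean_obs = rexp(1000)    # informative covariate, analogous to mean_obs above
)

res <- ihw(pval ~ mean_obs, data = toy, alpha = 0.1, nbins = 5)
head(adj_pvalues(res))     # covariate-weighted adjusted p-values
rejections(res)            # number of discoveries at alpha = 0.1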
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("tidyverse")
library("IsoformSwitchAnalyzeR")

results <- read_rds(snakemake@input[["rds"]])

results <- analyzePFAM(
    switchAnalyzeRlist = results, 
    pathToPFAMresultFile = snakemake@input[["pfam"]],
    quiet = TRUE
)

results <- analyzeCPAT(
    switchAnalyzeRlist = results,
    pathToCPATresultFile = snakemake@input[["cpat"]],
    codingCutoff = snakemake@params[["coding_cutoff"]],
    removeNoncodinORFs = snakemake@params[["remove_noncoding_orfs"]],
    quiet = TRUE
)

results <- analyzeAlternativeSplicing(
    results,
    quiet = FALSE,
    onlySwitchingGenes = FALSE
)

dir.create(snakemake@output[["plots_with"]])
dir.create(snakemake@output[["plots_without"]])

if(nrow(results$isoformFeatures) > 0) {
    results <- analyzeSwitchConsequences(
        results, 
        consequencesToAnalyze = c(
            'intron_retention',
            'coding_potential',
            # 'ORF_seq_similarity', TODO this is only needed for assembly, reactivate then
            'NMD_status',
            'domains_identified'
        ),
        onlySigIsoforms = FALSE,
        removeNonConseqSwitches = FALSE,
        quiet = TRUE, # set to TRUE to circumvent a bug leading to a stop() if there are no significant switches
        alpha = snakemake@params[["fdr"]],
        dIFcutoff = snakemake@params[["min_effect_size"]]
    )

    switchPlotTopSwitches(
        switchAnalyzeRlist = results,
        n = Inf,
        filterForConsequences = FALSE,
        splitComparison = FALSE,
        splitFunctionalConsequences = TRUE,
        pathToOutput = snakemake@params[["plotdir"]],
        alpha = snakemake@params[["fdr"]],
        dIFcutoff = snakemake@params[["min_effect_size"]],
        onlySigIsoforms = FALSE
    )
}

significant <- extractTopSwitches(
    results,
    filterForConsequences = FALSE,
    extractGenes = TRUE,
    n = Inf,
    sortByQvals = TRUE,
    alpha = snakemake@params[["fdr"]],
    dIFcutoff = snakemake@params[["min_effect_size"]]
)

write_tsv(significant, snakemake@output[["table"]])
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("tidyverse")
library("IsoformSwitchAnalyzeR")

model <- snakemake@params[["model"]]

m <- read_rds(snakemake@input[["designmatrix"]])
is_prefix_col <- startsWith(colnames(m), model[["primary_variable"]])
colnames(m) <- replace(colnames(m), is_prefix_col, "condition")
colnames(m) <- replace(colnames(m), colnames(m) == "sample", "sampleID")
m <- m %>%
      dplyr::select("sampleID", "condition", everything())

quant <- importIsoformExpression(
   sampleVector = set_names(paste(snakemake@input[["kallisto"]], "abundance.tsv", sep="/"), snakemake@params[["samples"]]),
   addIsofomIdAsColumn = TRUE
)

candidates <- importRdata(
    isoformCountMatrix = quant$counts,
    isoformRepExpression = quant$abundance,
    designMatrix = m,
    isoformExonAnnoation = snakemake@input[["gtf"]],
    isoformNtFasta = snakemake@input[["fasta"]],
    showProgress = FALSE
)

# TODO make cutoffs configurable
filtered_candidates <- preFilter(
    switchAnalyzeRlist = candidates,
    geneExpressionCutoff = 1,
    isoformExpressionCutoff = 0,
    removeSingleIsoformGenes = TRUE
)

results <- isoformSwitchTestDEXSeq(
    switchAnalyzeRlist = filtered_candidates,
    reduceToSwitchingGenes = FALSE
)

# get significant genes (have to do this manually, because IsoformSwitchAnalyzeR exits with an error
# in case of no significant genes).
keep <- unique(
    results$isoformFeatures$gene_ref[which(
        results$isoformFeatures$isoform_switch_q_value <
            snakemake@params[["fdr"]] &
            abs(results$isoformFeatures$dIF) > snakemake@params[["min_effect_size"]]
    )]
)

# if(length(keep) == 0) {
#     # No significant genes left, just keep the first to keep the analysis going.
#     keep <- results$isoformFeatures$gene_ref[1]
# }

# filter to significant genes
results <- subsetSwitchAnalyzeRlist(results, results$isoformFeatures$gene_ref %in% keep)

extractSequence(
    results,
    pathToOutput = snakemake@params[["seq_dir"]],
    addToSwitchAnalyzeRlist = TRUE,
    onlySwitchingGenes = FALSE
)

write_rds(results, file = snakemake@output[[1]], compress = "gz")
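Note on the design matrix built above: importRdata() expects one row per sample with mandatory sampleID and condition columns (hence the renaming), plus optional extra covariates. A purely hypothetical example of the expected shape:

m <- data.frame(
  sampleID  = c("s1", "s2", "s3", "s4"),
  condition = c("untreated", "untreated", "treated", "treated")
)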
import sys
import json

import altair as alt
import pandas as pd

sys.stderr = open(snakemake.log[0], "w")

# read the read length from the JSON input file
with open(snakemake.input["read_length"]) as f:
    read_length = json.load(f)

# Transcript IDs: a single transcript ID when plotting per transcript, or "all"
transcript_ids = snakemake.params["each_transcript"]

# Getting the bin files
fwrd_full = []
fwrd_trim = []
fwrd_fil = []

rev_full = []
rev_trim = []
rev_fil = []

# Append all sample bin files into a single dataframe
if transcript_ids != "all":
    fwrd_allsamp_hist_fil = snakemake.input["fwrd_allsamp_hist_fil"]
    rev_allsamp_hist_fil = snakemake.input["rev_allsamp_hist_fil"]
    if fwrd_allsamp_hist_fil:
        for filename in fwrd_allsamp_hist_fil:
            df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
            # NB: "bins_foward" (sic) matches the column name produced upstream
            fwrd_fil.append(
                df.loc[
                    :, df.columns.isin(["sample_Name", "Freq_forward", "bins_foward"])
                ]
            )
        fwrd_allsamp_hist_fil = pd.concat(fwrd_fil, axis=0, ignore_index=True)
    else:
        # no forward-strand inputs: use an empty frame so the .empty checks below work
        fwrd_allsamp_hist_fil = pd.DataFrame()

    if rev_allsamp_hist_fil:
        for filename in rev_allsamp_hist_fil:
            df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
            rev_fil.append(
                df.loc[:, df.columns.isin(["sample_Name", "Freq_rev", "bins_rev"])]
            )
        rev_allsamp_hist_fil = pd.concat(rev_fil, axis=0, ignore_index=True)
    else:
        rev_allsamp_hist_fil = pd.DataFrame()
else:
    file_fwrd_allsamp_hist = snakemake.input["fwrd_allsamp_hist"]
    file_fwrd_allsamp_hist_trim = snakemake.input["fwrd_allsamp_hist_trim"]
    file_rev_allsamp_hist = snakemake.input["rev_allsamp_hist"]
    file_rev_allsamp_hist_trim = snakemake.input["rev_allsamp_hist_trim"]

    for filename in file_fwrd_allsamp_hist:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        fwrd_full.append(df)
    fwrd_allsamp_hist = pd.concat(fwrd_full, axis=0, ignore_index=True)

    for filename in file_fwrd_allsamp_hist_trim:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        fwrd_trim.append(df)
    fwrd_allsamp_hist_trim = pd.concat(fwrd_trim, axis=0, ignore_index=True)

    for filename in file_rev_allsamp_hist:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        rev_full.append(df)
    rev_allsamp_hist = pd.concat(rev_full, axis=0, ignore_index=True)

    for filename in file_rev_allsamp_hist_trim:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        rev_trim.append(df)
    rev_allsamp_hist_trim = pd.concat(rev_trim, axis=0, ignore_index=True)

# Plot the qc-histogram

if transcript_ids != "all":
    if not fwrd_allsamp_hist_fil.empty:

        hist_fwrd_full = (
            alt.Chart(fwrd_allsamp_hist_fil)
            .mark_line(interpolate="step-after")
            .encode(
                x=alt.X(
                    "bins_foward",
                    title="difference between transcript length and read start",
                ),
                y=alt.Y("Freq_forward:Q", title="Count of Records"),
                color="sample_Name",
            )
            .properties(title="forward strand transcripts (full length)")
        )

        fwrd_allsamp_hist_fil["read_length"] = read_length

        cht_rd_len_fwd_full = (
            alt.Chart(fwrd_allsamp_hist_fil)
            .mark_rule(color="black", strokeDash=[3, 5])
            .encode(x="read_length")
        )

        Fwd_chart_full = hist_fwrd_full + cht_rd_len_fwd_full
        Fwd_chart_full.save(snakemake.output["full_sample_QC"])

    elif not rev_allsamp_hist_fil.empty:
        hist_rev_full = (
            alt.Chart(rev_allsamp_hist_fil)
            .mark_line(interpolate="step-after")
            .encode(
                x=alt.X("bins_rev", title="read start position in the transcript"),
                y=alt.Y("Freq_rev:Q", title="Count of Records"),
                color="sample_Name",
            )
            .properties(title="Reverse strand transcripts (full length)")
        )

        rev_allsamp_hist_fil["read_length"] = read_length

        cht_rd_len_rev = (
            alt.Chart(rev_allsamp_hist_fil)
            .mark_rule(color="black", strokeDash=[3, 5])
            .encode(x="read_length")
        )

        Rev_chart_full = hist_rev_full + cht_rd_len_rev

        Rev_chart_full.save(snakemake.output["full_sample_QC"])

    else:
        # no reads aligned on either strand: emit a placeholder chart
        empty_histogram = alt.Chart().mark_text(text="no reads aligned", size=20)
        empty_histogram.save(snakemake.output["full_sample_QC"])


else:
    # Histogram for forward strand
    # Histogram for full len transcript
    hist_fwrd_full = (
        alt.Chart(fwrd_allsamp_hist)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X(
                "bins_foward",
                title="difference between transcript length and read start",
            ),
            y=alt.Y("Freq_forward:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="forward strand transcripts (full length)")
    )
    fwrd_allsamp_hist["read_length"] = read_length
    cht_rd_len_fwd_full = (
        alt.Chart(fwrd_allsamp_hist)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    Fwd_chart_full = hist_fwrd_full + cht_rd_len_fwd_full

    # Histogram plot for 20000 bp len transcript
    hist_fwrd_trimd = (
        alt.Chart(fwrd_allsamp_hist_trim)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X(
                "bins_foward",
                title="difference between transcript length and read start",
            ),
            y=alt.Y("Freq_forward:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="forward strand transcripts (showing 1-20000bp)")
    )
    fwrd_allsamp_hist_trim["read_length"] = read_length
    cht_rd_len_fwd_trim = (
        alt.Chart(fwrd_allsamp_hist_trim)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    Fwd_chart_trim = hist_fwrd_trimd + cht_rd_len_fwd_trim

    Fwd_chart = alt.hconcat(Fwd_chart_trim, Fwd_chart_full)

    # Histogram for reverse strand
    # Histogram for full len transcript
    hist_rev_full = (
        alt.Chart(rev_allsamp_hist)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X("bins_rev", title="read start position in the transcript"),
            y=alt.Y("Freq_rev:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="Reverse strand transcripts (full length)")
    )
    rev_allsamp_hist["read_length"] = read_length
    cht_rd_len_rev = (
        alt.Chart(rev_allsamp_hist)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    rev_chart_full = hist_rev_full + cht_rd_len_rev

    # Histogram plot for 20000 bp len transcript
    hist_rev_trim = (
        alt.Chart(rev_allsamp_hist_trim)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X("bins_rev", title="read start position in the transcript"),
            y=alt.Y("Freq_rev:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="Reverse strand transcripts (showing 1-20000bp)")
    )
    rev_allsamp_hist_trim["read_length"] = read_length
    cht_rd_len_rev_trim = (
        alt.Chart(rev_allsamp_hist_trim)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    rev_chart_trim = hist_rev_trim + cht_rd_len_rev_trim

    Rev_chart = alt.hconcat(rev_chart_trim, rev_chart_full)

    Final_chart = alt.vconcat(Fwd_chart, Rev_chart)
    Final_chart.save(snakemake.output["full_sample_QC"])
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("tidyverse")
library("sleuth")

so <- sleuth_load(snakemake@input[["so"]])

# a negative n makes dplyr::top_n() select the n smallest q-values
top_n <- -strtoi(snakemake@params[["top_n"]])

results <- read_tsv(snakemake@input[["transcripts"]])

top_genes <- results %>%
   filter(qval <= snakemake@params[["fdr"]]) %>%
   top_n(top_n, qval) %>%
   dplyr::select(ext_gene) %>%
   drop_na() %>%
   distinct(ext_gene)


if (snakemake@params[["genes"]]$activate == TRUE) {
    gene_table <- read.csv(snakemake@params[["genes"]]$genelist,
        sep = "\t")
    names(gene_table) <- c("ext_gene")
    genes_of_interest <- tibble(gene_table) %>%
        distinct(ext_gene)
} else {
    # "genes" is null, if the list provided in config.yaml is empty
    genes_of_interest <- tibble(ext_gene = character())
}
genes <- full_join(top_genes, genes_of_interest, by = "ext_gene") %>%
    add_row(ext_gene = "Custom") %>%
    pull(ext_gene)

dir.create(snakemake@output[[1]])

for (gene in genes) {
    transcripts <- results %>%
        filter(ext_gene == gene) %>%
        drop_na() %>%
        pull(target_id)

    if (length(transcripts) > 0) {
        for (transcript in transcripts) {
            p <- plot_bootstrap(so, transcript, color_by = snakemake@params[["color_by"]], units = "est_counts")
            ggsave(file = str_c(snakemake@output[[1]], "/", gene, ".", transcript, ".", snakemake@wildcards[["model"]], ".bootstrap.pdf"), plot = p)
        }
    }
}
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


library(pheatmap)
library(dplyr)
library(tidyr)
# Reading the sleuth log count matrix file
sleuth_file <- read.csv(snakemake@input[["logcountmatrix_file"]],
    sep = "\t", header = TRUE
)
# Check the config file if it contains pre-defined gene list

if (snakemake@wildcards[["mode"]] == "topn") {
    # Replacing the rownames with transcript Id and gene name
    rownames(sleuth_file) <- paste(
        sleuth_file$transcript,
        ":", sleuth_file$gene
    )
    sleuth_file$gene <- NULL
    sleuth_file$transcript <- NULL
    # Getting the per-transcript variance across samples
    vargenes <- apply(sleuth_file, 1, var)
    # Selecting the 50 most variable transcripts
    selectedgenes <- sleuth_file[
        row.names(sleuth_file) %in%
            names(vargenes[order(vargenes, decreasing = TRUE)][1:50]),
    ]
    # If the config file contains a pre-defined gene list
} else if (snakemake@wildcards[["mode"]] == "predefined") {
    # Reading the pre-defined gene list
    predefine_genelist <- read.table(snakemake@input[["predef_genelist"]],
        sep = "\t"
    )
    selectedgenes <- sleuth_file %>%
        filter(gene %in% predefine_genelist$V1)
    rownames(selectedgenes) <- paste(
        selectedgenes$transcript,
        ":", selectedgenes$gene
    )
    selectedgenes$gene <- NULL
    selectedgenes$transcript <- NULL
}
# Check whether the selected genes contain any non-zero expression values
if (all(selectedgenes == 0)) {
    txt <- "cannot plot, all values are zero"
    pdf(snakemake@output[["diffexp_heatmap"]], height = 10, width = 10)
    plot.new()
    text(.5, .5, txt, font = 2, cex = 1.5)
    dev.off()
    # If there are more than 100 transcripts, scale the PDF height accordingly
    # (in RNA-seq, multiple transcripts may be present for a single gene).
} else if (nrow(selectedgenes) > 100) {
    pdf_height <- round(nrow(selectedgenes) / 5)
    pdf(snakemake@output[["diffexp_heatmap"]],
        height = pdf_height, width = ncol(selectedgenes) * 2
    )
    pheatmap(selectedgenes,
        cluster_rows = FALSE, display_numbers = TRUE,
        cellheight = 10, scale = "row"
    )
    dev.off()
} else {
    pdf_height <- round(nrow(selectedgenes) / 5)
    pdf(
        file = snakemake@output[["diffexp_heatmap"]],
        height = pdf_height, width = ncol(selectedgenes) * 2
    )
    pheatmap(selectedgenes, scale = "row")
    dev.off()
}
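The topn branch above reduces to ranking rows by variance and keeping the top 50; a minimal sketch of that selection on random data (sizes here are arbitrary):

m <- matrix(rnorm(200), nrow = 20)
row_vars <- apply(m, 1, var)
top_rows <- m[order(row_vars, decreasing = TRUE)[1:5], ]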
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("ggplot2")
library("tidyr")

diffexp <- sleuth_load(snakemake@input[["diffexp_rds"]]) %>% drop_na(pval)

ggplot(diffexp) + geom_histogram(aes(pval), bins = 100)
ggsave(file = snakemake@output[[1]], width = 14)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("fgsea")

# provides library("tidyverse") and function get_prefix_col()
# the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])


gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
sig_gene_sets <- read_tsv(snakemake@input[["sig_gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
                  drop_na(ext_gene) %>%
                  mutate(ext_gene = str_to_upper(ext_gene)) %>%
                  group_by(ext_gene) %>%
                  filter( qval == min(qval, na.rm = TRUE) ) %>%
                  mutate(target_id = str_c(target_id, collapse=",")) %>%
                  mutate(ens_gene = str_c(ens_gene, collapse=",")) %>%
                  distinct()

signed_pi <- get_prefix_col("signed_pi_value", colnames(diffexp))

ranked_genes <- diffexp %>%
                  dplyr::select(ext_gene, !!signed_pi) %>%
                  deframe()

dir.create( snakemake@output[[1]] )

for ( set in (sig_gene_sets %>% pull(pathway)) ) {
  # plot gene set enrichment, skipping sets with no genes in the ranked list
  if (length(gene_sets[[set]]) == 0 || all(is.na(ranked_genes[as.vector(gene_sets[[set]])]))) {
    next
  }
  p <- plotEnrichment(gene_sets[[set]], ranked_genes) +
         ggtitle(str_c("gene set: ", set),
           subtitle = str_c(
             sig_gene_sets %>% filter(pathway == set) %>% pull(size)," genes;  ",
             "p-value (BH-adjusted): ", sig_gene_sets %>% filter(pathway ==  set) %>% pull(padj), "\n",
             "normalized enrichment score (NES):", sig_gene_sets %>% filter(pathway == set) %>% pull(NES)
             )
         ) +
         xlab("gene rank") +
         theme_bw( base_size = 16 )
  setname <- gsub("[-%/:,'\\. ]", "_", set)
  fname <- str_c(snakemake@wildcards[["model"]], ".", setname, ".gene-set-plot.pdf")
  ggsave(
    file = file.path( snakemake@output[[1]], fname ),
    plot = p,
    width = 10,
    height = 7
  )
}
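The sig_gene_sets table consumed above is produced by an fgsea run over the same ranked gene vector; a hedged sketch of how such a table typically arises (pathway, padj, NES and size are standard fgsea output columns; the 0.05 cutoff is illustrative):

fgsea_res <- fgsea(pathways = gene_sets, stats = ranked_genes)
sig_gene_sets <- fgsea_res %>%
  as_tibble() %>%
  filter(padj <= 0.05)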
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")

so <- sleuth_load(snakemake@input[[1]])

pdf(file = snakemake@output[[1]])
plot_fld(so, paste0(snakemake@wildcards[["sample"]], "-", snakemake@wildcards[["unit"]]))
dev.off()
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")


so <- sleuth_load(snakemake@input[[1]])

plot_group_density(so,
                   use_filtered = TRUE,
                   units = "est_counts",
                   trans = "log",
                   grouping = setdiff(colnames(so$sample_to_covariates), "sample"),
                   offset = 1)
ggsave(snakemake@output[[1]])
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("ggpubr")

#principal components
pc <- 4

# plot pca
so <- sleuth_load(snakemake@input[[1]])
pc12 <- plot_pca(so, color_by = snakemake@wildcards[["covariate"]], units = "est_counts", text_labels = TRUE)
pc34 <- plot_pca(so, pc_x = 3, pc_y = 4, color_by = snakemake@wildcards[["covariate"]], units = "est_counts", text_labels = TRUE)

ggarrange(pc12, pc34, ncol = 2, common.legend = TRUE)
ggsave(snakemake@output[["pca"]], width=14)

# plot pc variance
plot_pc_variance(so, use_filtered = TRUE, units = "est_counts", pca_number = pc)
ggsave(snakemake@output[["pc_var"]], width=14)

# plot loadings
pc_loading_plots <- list()
for(i in 1:pc) {
  pc_loading_plots[[paste0("pc", i, "_loading")]] <- plot_loadings(so, 
                                                                   scale = TRUE, 
                                                                   pc_input = i,
                                                                   pc_count = 10, 
                                                                   units = "est_counts") +
    ggtitle(paste0(snakemake@wildcards[["covariate"]], ": plot loadings for principal component ", i)) +
    theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
}
plots_loading <- ggarrange(plotlist = pc_loading_plots, ncol = 1, nrow = 1, common.legend = TRUE)
ggexport(plots_loading, filename = snakemake@output[["loadings"]], width = 14)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")
library("ggpubr")

so <- sleuth_load(snakemake@input[[1]])
plot_list <- list()

# combinations between samples without repetition 

# sample_matrix <- t(combn(so$sample_to_covariates$sample, 2))
# combinations <- as.numeric(row(sample_matrix))
# 
# for(i in combinations) {
#   sample1 <- sample_matrix[i, 1]
#   sample2 <- sample_matrix[i, 2]
#   
#   plot_list[[i]] <- plot_scatter(so, 
#                                  sample_x = sample1, 
#                                  sample_y = sample2, 
#                                  use_filtered = TRUE, 
#                                  units = "est_counts", 
#                                  point_alpha = 0.2, 
#                                  xy_line = TRUE, 
#                                  xy_line_color = "red") +
#     labs(x = sample1, y = sample2)
#   }


# all combinations between samples (with repetition)

samples <- so$sample_to_covariates$sample
sample_matrix <- crossing(x = samples, y = samples) # creates all combinations between samples
combinations <- seq_len(nrow(sample_matrix))

for(i in combinations) {
  sample1 <- sample_matrix$x[i]
  sample2 <- sample_matrix$y[i]
  if(sample1 != sample2) {
    plot_list[[i]] <- plot_scatter(so,
                                   sample_x = sample1,
                                   sample_y = sample2,
                                   use_filtered = TRUE,
                                   units = "est_counts",
                                   point_alpha = 0.2,
                                   xy_line = TRUE,
                                   xy_line_color = "red") +
      labs(x = sample1, y = sample2)

  } else {
    x_val <- y_val <- c(0,0)
    test <- tibble(x_val, y_val)
    plot_list[[i]] <- ggplot(test, aes(x = x_val, y = y_val)) +
      theme_void() +
      geom_text(label = "") +
      annotate("text", label = sample1, x=0, y=0, colour = "blue", fontface = "bold")
  }
}

all_plots <- ggarrange(plotlist = plot_list)
plot_figure <- annotate_figure(all_plots, 
                               top = text_grob("log transformed scatter plots of different samples",
                                               color = "black", 
                                               face = "bold", 
                                               size = 14))
ggsave(snakemake@output[[1]], plot_figure)
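tidyr::crossing(), used above, enumerates every ordered sample pair including self-pairs, which the loop turns into the diagonal labels of the scatter matrix:

library("tidyverse")
crossing(x = c("A", "B"), y = c("A", "B"))
# 4 rows: A/A, A/B, B/A, B/B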
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")

sl <- snakemake@params[['sig_level']]

so <- sleuth_load(snakemake@input[[1]])

# colors from colorblind-safe Brewer palette "Dark2":
# http://colorbrewer2.org/#type=qualitative&scheme=Dark2&n=3
cb_safe_green <- "#1B9E77"
cb_safe_red <- "#D95F02"

p <- plot_vars(so,
        test = NULL,
        test_type = "lrt",
        which_model = "full",
        sig_level = sl,
        point_alpha = 0.2,
        sig_color = cb_safe_red,
        xy_line = TRUE,
        xy_line_color = cb_safe_red,
        highlight = NULL,
        highlight_color = cb_safe_green
    )
ggsave(filename = snakemake@output[[1]], plot = p, width = 7, height = 7)
import re
import sys

from Bio import SeqIO

sys.stderr = open(snakemake.log[0], "w")

with open(snakemake.output[0], "w") as transcript_clean_cdna_fasta:
    for seq_record in SeqIO.parse(snakemake.input["ref_fasta"], "fasta"):
        transcript_location = seq_record.description.split(" ")[2]
        strand = transcript_location.split(":")[5]
        if strand == "1":
            polyrem_seq = re.sub("TTTT+$|AAAA+$", "", str(seq_record.seq))
            print(
                ">",
                seq_record.id,
                "_1 ",
                seq_record.description,
                sep="",
                file=transcript_clean_cdna_fasta,
            )
            print(polyrem_seq, file=transcript_clean_cdna_fasta)
        elif strand == "-1":
            polyrem_seq = re.sub("^TTTT+|^AAAA+", "", str(seq_record.seq))
            print(
                ">",
                seq_record.id,
                "_-1 ",
                seq_record.description,
                sep="",
                file=transcript_clean_cdna_fasta,
            )
            print(polyrem_seq, file=transcript_clean_cdna_fasta)
import sys

from Bio import SeqIO

sys.stderr = open(snakemake.log[0], "w")

with open(snakemake.output[0], "w") as three_prime_output_file:
    for seq_record in SeqIO.parse(snakemake.input["ref_fasta"], "fasta"):
        transcript_location = seq_record.description.split(" ")[0]
        # Split the transcript id by `_` to filter the strand info from transcript id
        transcript_id = transcript_location.split("_")[0]
        print(">", transcript_id, sep="", file=three_prime_output_file)
        print(seq_record.seq, file=three_prime_output_file)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")
library("fs")
library("grid")
library("gridExtra")
library("dplyr")
model <- snakemake@params[["model"]]

sleuth_object <- sleuth_load(snakemake@input[[1]])

sleuth_object <- sleuth_lrt(sleuth_object, "reduced", "full")

# plot mean-variance
mean_var_plot <- plot_mean_var(sleuth_object, 
                               which_model = "full", 
                               point_alpha = 0.4,
                               point_size = 2, 
                               point_colors = c("black", "dodgerblue"),
                               smooth_alpha = 1, 
                               smooth_size = 0.75, 
                               smooth_color = "red")
ggsave(snakemake@output[["mean_var_plot"]], mean_var_plot)

write_results <- function(so, mode, output, output_all) {
    so$pval_aggregate <- FALSE
    if(mode == "aggregate") {
      # work around the bug described in the following pull request:
      # https://github.com/pachterlab/sleuth/pull/240
      # TODO the renaming can be removed once this is fixed
      g_col <- so$gene_column
      so$gene_column <- NULL
      so$pval_aggregate <- TRUE
      so$gene_column <- g_col
    }

    plot_model <- snakemake@wildcards[["model"]]

    # list for qq-plots to make a multipage pdf-file as output
    qq_list <- list()

    print("Performing likelihood ratio test")
    all <- sleuth_results(so, "reduced:full", "lrt", show_all = TRUE, pval_aggregate = so$pval_aggregate) %>%
            arrange(pval)
    all <- all %>% mutate_if(is.numeric, signif, 3)
    covariates <- colnames(design_matrix(so, "full"))
    covariates <- covariates[covariates != "(Intercept)"]

    # iterate over all covariates and perform a Wald test in order to obtain beta estimates
    if(!so$pval_aggregate) {

      # lists for volcano and ma-plots to make a multipage pdf-file as output
      volcano_list <- list()
      ma_list <- list()

      for(covariate in covariates) {
            print(str_c("Performing wald test for ", covariate))
            so <- sleuth_wt(so, covariate, "full")

            volc_plot_title <- str_c(plot_model, ": volcano plot for ", covariate)
            ma_plot_title <- str_c(plot_model, ": ma-plot for ", covariate)
            qq_plot_title <- str_c(plot_model, ": qq-plot from wald test for ", covariate)

            # volcano plot
            print(str_c("Performing volcano plot for ", covariate))
            volcano <- plot_volcano(so, covariate, "wt", "full",
                                    sig_level = snakemake@params[["sig_level_volcano"]], 
                                    point_alpha = 0.2, 
                                    sig_color = "red",
                                    highlight = NULL) +
              ggtitle(volc_plot_title) +
              theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
            volcano_list[[volc_plot_title]] <- volcano

            # ma-plot
            print(str_c("Performing ma-plot for ", covariate))
            ma_plot <- plot_ma(so, covariate, "wt", "full",
                               sig_level = snakemake@params[["sig_level_ma"]], 
                               point_alpha = 0.2, 
                               sig_color = "red",
                               highlight = NULL, 
                               highlight_color = "green") +
              ggtitle(ma_plot_title) +
              theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
            ma_list[[ma_plot_title]] <- ma_plot

            # qq-plots from wald test
            print(str_c("Performing qq-plot from wald test for ", covariate))
            qq_plot <- plot_qq(so, covariate, "wt", "full", 
                               sig_level = snakemake@params[["sig_level_qq"]],
                               point_alpha = 0.2, 
                               sig_color = "red", 
                               highlight = NULL, 
                               highlight_color = "green",
                               line_color = "blue") +
              ggtitle(qq_plot_title) +
              theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
            qq_list[[qq_plot_title]] <- qq_plot


            beta_col_name <- str_c("b", covariate, sep = "_")
            beta_se_col_name <- str_c(beta_col_name, "se", sep = "_")
            qval_name <- str_c("qval", covariate, sep = "_")
            all_wald <- sleuth_results(so, covariate, "wt", show_all = TRUE, pval_aggregate = FALSE) %>%
                        dplyr::select( target_id = target_id,
                                !!qval_name := qval,
                                !!beta_col_name := b,
                                !!beta_se_col_name := se_b)
            signed_pi_col_name <- str_c("signed_pi_value", covariate, sep = "_")
            all <- inner_join(all, all_wald, by = "target_id") %>%
                    # calculate a signed version of the pi-value from:
                    # https://dx.doi.org/10.1093/bioinformatics/btr671
                    # e.g. useful for GSEA ranking
                    mutate( !!signed_pi_col_name := -log10(pval) * !!sym(beta_col_name) )
      }
      # saving volcano plots
      marrange_volcano <- marrangeGrob(grobs=volcano_list, nrow=1, ncol=1, top = NULL)
      ggsave(snakemake@output[["volcano_plots"]], plot = marrange_volcano, width = 14)

      # saving ma-plots
      marrange_ma <- marrangeGrob(grobs=ma_list, nrow=1, ncol=1, top = NULL)
      ggsave(snakemake@output[["ma_plots"]], plot = marrange_ma, width = 14)
    }

    if(mode == "mostsignificant") {
        # for each gene, select the most significant transcript (this is equivalent to sleuth_gene_table, but with bug fixes)
      all <- all %>%
                drop_na(ens_gene) %>%
                group_by(ens_gene) %>%
                filter( qval == min(qval, na.rm = TRUE) ) %>%
                # ties in qval (e.g. min(qval) == 1) can lead to multiple entries per gene
                filter( pval == min(pval, na.rm = TRUE) ) %>%
                # for min(qval) == 1, then usually also min(pval) == 1, and
                # we need something else to select a unique entry per gene
                filter( mean_obs == max(mean_obs, na.rm = TRUE) ) %>%
                # sometimes we still get two transcript variants with exactly
                # the same values, i.e. they have exactly the same sequence
                # but (slightly) different annotations -- then we retain a string
                # with a comma-separated list of them
                mutate(target_id = str_c(target_id, collapse=",")) %>%
                distinct() %>%
                # useful sort for scrolling through output by increasing q-values
                arrange(qval)
      all <- all %>% mutate_if(is.numeric, signif, 3)
      # qq-plot from likelihood ratio test
      print(str_c("Performing qq-plot from likelihood ratio test"))
      qq_plot_title_trans <- str_c(plot_model, ": qq-plot from likelihood ratio test")
      qq_plot_trans <- plot_qq(so, 
                                test = 'reduced:full', 
                                test_type = 'lrt', 
                                sig_level = snakemake@params[["sig_level_qq"]],
                                point_alpha = 0.2, 
                                sig_color = "red", 
                                highlight = NULL, 
                                highlight_color = "green",
                                line_color = "blue") +
        ggtitle(qq_plot_title_trans) +
        theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
      qq_list[[qq_plot_title_trans]] <- qq_plot_trans
    } else if (mode == "canonical") {
      all <- all %>%
                drop_na(canonical) %>%
                filter(canonical)
      if (nrow(all) == 0) {
        stop("No canonical transcripts found (does ensembl support canonical transcript annotation for your species?")
      }
      # Control FDR again, because we have less tests now.
      all$qval <- p.adjust(all$pval, method = "BH")
      all <- all %>% mutate_if(is.numeric, signif, 3)
    } else if (mode == "custom") {
      # load custom ID list
      id_version_pattern <- "\\.\\d+$"
      ids <- read_tsv(snakemake@params[["representative_transcripts"]], col_names = "ID")$ID
      ids <- str_replace(ids, id_version_pattern, "")
      all <- all %>% 
        mutate(target_id_stem = str_replace(target_id, id_version_pattern, "")) %>% 
        filter(target_id_stem %in% ids) %>% 
        mutate(target_id_stem = NULL)
      all <- all %>% mutate_if(is.numeric, signif, 3)
      if (nrow(all) == 0) {
        stop("The given list of representative transcript ids does not match any of the transcript ids of the chosen species.")
      }
    }

    # saving qq-plots
    marrange_qq <- marrangeGrob(grobs=qq_list, nrow=1, ncol=1, top = NULL)
    ggsave(snakemake@output[["qq_plots"]], plot = marrange_qq, width = 14)

    write_rds(all, file = output_all, compress = "none")
    # add sample expressions
    all <- all %>% left_join(as_tibble(sleuth_to_matrix(so, "obs_norm", "est_counts"), rownames = "target_id"), by = "target_id")
    all <- all %>% mutate_if(is.numeric, signif, 3)
    write_tsv(all, file = output, escape = "none")
}

write_results(sleuth_object, "transcripts", snakemake@output[["transcripts"]], snakemake@output[["transcripts_rds"]])
write_results(sleuth_object, "aggregate", snakemake@output[["genes_aggregated"]], snakemake@output[["genes_aggregated_rds"]])

repr_trans <- snakemake@params[["representative_transcripts"]]
if (repr_trans == "canonical") {
  write_results(sleuth_object, "canonical", snakemake@output[["genes_representative"]], snakemake@output[["genes_representative_rds"]])
} else if (repr_trans == "mostsignificant") {
  write_results(sleuth_object, "mostsignificant", snakemake@output[["genes_representative"]], snakemake@output[["genes_representative_rds"]])
} else {
  write_results(sleuth_object, "custom", snakemake@output[["genes_representative"]], snakemake@output[["genes_representative_rds"]])
}
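For reference, the per-covariate signed pi-value computed above (see the DOI linked in the code) combines significance and effect size into a single ranking statistic:

$$\pi_{\text{signed}} = -\log_{10}(p) \cdot \beta$$

where $p$ is the likelihood ratio test p-value and $\beta$ the Wald-test beta estimate for the covariate, so genes that are both significant and strongly up- or down-regulated end up at the extremes of a GSEA ranking.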
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")

model <- snakemake@params[["model"]]

samples <- read_tsv(snakemake@input[["samples"]], na = "", col_names = TRUE) %>%
            # make everything except the sample name and the path a factor
            mutate_at(  vars(-sample, -path),
                        list(~factor(.))
                        )

if(!is.null(snakemake@params[["exclude"]])) {
    samples <- samples %>%
                filter( !sample %in% snakemake@params[["exclude"]] )
}

samples_out <- if(!is.null(model[["full"]])) {
    # retrieve the model formula
    formula <- as.formula(model[["full"]])
    # extract variables from the formula and unnest any nested variables
    variables <- labels(terms(formula)) %>%
                    strsplit('[:*]') %>%
                    unlist()
    # remove samples with an NA value in any of the columns
    # relevant for sleuth under the current model
    samples <- samples %>%
	        drop_na(c(sample, path, all_of(variables)))

    primary_variable <- model[["primary_variable"]]
    base_level <- model[["base_level"]]
    # TODO migrate this to tidyverse
    # Ensure that primary variable factors are sorted such that base_level comes first.
    # This is important for fold changes, effect sizes to have the expected sign.
    samples[, primary_variable] <- relevel(samples[, primary_variable, drop=TRUE], base_level)

    samples %>% select(c(sample, all_of(variables)))
} else {
    samples %>% select(-path)
}

# store design matrix
write_rds(samples_out, file = snakemake@output[["designmatrix"]])

# remove all columns which have only NA values
samples <- samples %>%
	    select_if(function(col) !all(is.na(col)))

t2g <- read_rds(snakemake@input[["transcript_info"]])

so <- sleuth_prep(  samples,
                    extra_bootstrap_summary = TRUE,
                    target_mapping = t2g,
                    aggregation_column = "ens_gene",
                    read_bootstrap_tpm = TRUE,
                    transform_fun_counts = function(x) log2(x + 0.5),
                    num_cores = snakemake@threads
                    )

if ("canonical" %in% colnames(so$target_mapping)) {
    # Sleuth converts all columns to characters. We don't want that for the canonical column.
    # Hence we fix it here.
    so$target_mapping$canonical <- as.logical(so$target_mapping$canonical)
}
custom_transcripts <- so$obs_raw %>%
                        # find transcripts not in the target_mapping
                        filter(!target_id %in% so$target_mapping$target_id) %>%
                        # make it a succinct list without repetitions
                        distinct(target_id) %>%
                        # pull it out into a vector
                        pull(target_id)

if(length(custom_transcripts) > 0) {
    so$target_mapping <- so$target_mapping %>%
                        # add those custom transcripts as rows to the target mapping
                        add_row(ens_gene = NA, ext_gene = "Custom", target_id = custom_transcripts, canonical = NA)
}

if(!is.null(model[["full"]])) {
    so <- sleuth_fit(so, as.formula(model[["full"]]), 'full')
    so <- sleuth_fit(so, as.formula(model[["reduced"]]), 'reduced')
}

sleuth_save(so, snakemake@output[["sleuth_object"]])
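The relevel() step above is what fixes the sign convention of the downstream beta estimates: the first factor level becomes the reference of the model matrix. A small illustration:

f <- factor(c("treated", "untreated", "treated"))
levels(f)                    # "treated" "untreated" (alphabetical default)
f <- relevel(f, "untreated")
levels(f)                    # "untreated" now comes first, i.e. the base level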
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("limma")
library("tidyverse")

so <- sleuth_load(snakemake@input[[1]])

# get normed counts
norm_counts <- sleuth_to_matrix(so, "obs_norm", "est_counts")

# log transform
log_norm_counts <- log2(norm_counts + 1)

# reorder columns to match covariates
log_norm_counts <- log_norm_counts[, so$sample_to_covariates$sample, drop=FALSE]

# obtain covariates
model <- snakemake@params[["model"]]
model_nobatch <- paste("~", model[["primary_variable"]], sep="")

covariates <- model.matrix(as.formula(model[["reduced"]]), data = so$sample_to_covariates)
design <- model.matrix(as.formula(model_nobatch), data = so$sample_to_covariates)
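
# Per the limma documentation, removeBatchEffect() regresses out the
# effects given via `covariates` (here the reduced model) while protecting
# the effects in `design` (here the primary variable), so that the signal
# of interest is preserved.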

stopifnot(so$sample_to_covariates$sample == colnames(log_norm_counts))

# remove batch effects
final_counts <- removeBatchEffect(log_norm_counts, covariates = covariates, design = design)

target_mapping <- so$target_mapping
rownames(target_mapping) <- target_mapping$target_id

final_counts <- rownames_to_column(as.data.frame(final_counts), var="transcript") %>% 
                add_column(
                    gene = target_mapping[rownames(final_counts), "ext_gene"],
                    .after = "transcript"
                )


write_tsv(final_counts, snakemake@output[[1]])
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("SPIA")
library("graphite")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

pw_db <- snakemake@params[["pathway_db"]]
db <- readRDS(snakemake@input[["spia_db"]])

options(Ncpus = snakemake@threads)

diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
    drop_na(ens_gene) %>%
    mutate(ens_gene = str_c("ENSEMBL:", ens_gene))
universe <- diffexp %>% pull(var = ens_gene)
sig_genes <- diffexp %>% filter(qval <= 0.05)
if (nrow(sig_genes) == 0) {
    cols <- c(
        "Name", "Status", "Combined FDR",
        "total perturbation accumulation", "number of genes on the pathway",
        "number of DE genes per pathway", "p-value for at least NDE genes",
        "Combined Bonferroni p-values",
        "p-value to observe a total accumulation", "Combined p-value", "Ids"
    )
    res <- data.frame(matrix(ncol = 11, nrow = 0, dimnames = list(NULL, cols)))
    # create an empty perturbation plots PDF alongside the empty tables
    pdf(file = snakemake@output[["plots"]])
    dev.off()
    write_tsv(res, snakemake@output[["table"]])
    write_tsv(res, snakemake@output[["table_activated"]])
    write_tsv(res, snakemake@output[["table_inhibited"]])
} else {
    # get logFC equivalent (the sum of beta scores of covariates of interest)

    beta_col <- get_prefix_col("b", colnames(sig_genes))

    beta <- sig_genes %>%
        dplyr::select(ens_gene, !!beta_col) %>%
        deframe()

    t <- tempdir(check = TRUE)
    olddir <- getwd()
    setwd(t)
    prepareSPIA(db, pw_db)
    res <- runSPIA(
        de = beta, all = universe, pw_db,
        plots = TRUE, verbose = TRUE
    )
    setwd(olddir)

    file.copy(
        file.path(t, "SPIAPerturbationPlots.pdf"),
        snakemake@output[["plots"]]
    )
    pathway_names <- db[res$Name]
    path_ids <- as.matrix(lapply(pathway_names@entries, slot, "id"))
    if (length(path_ids) > 0) {
        path_ids_data_frame <-
            data.frame(Ids = matrix(unlist(path_ids),
                nrow = length(path_ids), byrow = TRUE
            ))
        final_res <- cbind(res,
            Ids = path_ids_data_frame$Ids
        )
        res_reorder <- dplyr::select(
            final_res, Name, Status,
            pGFdr, tA, pSize, NDE, pNDE, pGFWER, pPERT, pG, Ids
        )
        res_reorder <- res_reorder %>%
            rename(
                "Combined Bonferroni p-values" = "pGFWER",
                "Combined FDR" = "pGFdr",
                "total perturbation accumulation" = "tA",
                "number of genes on the pathway" = "pSize",
                "number of DE genes per pathway" = "NDE",
                "Combined p-value" = "pG",
                "p-value to observe a total accumulation" = "pPERT",
                "p-value for at least NDE genes" = "pNDE"
            )
        write_tsv(res_reorder, snakemake@output[["table"]])
        sort_activated <- res_reorder[res_reorder$Status == "Activated", ]
        sort_inhibited <- res_reorder[res_reorder$Status == "Inhibited", ]
        write_tsv(sort_activated, snakemake@output[["table_activated"]])
        write_tsv(sort_inhibited, snakemake@output[["table_inhibited"]])
    } else {
        columns <- c(
            "Name", "Status", "Combined FDR", "total perturbation accumulation",
            "number of genes on the pathway", "number of DE genes per pathway",
            "p-value for at least NDE genes", "Combined Bonferroni p-values",
            "p-value to observe a total accumulation", "Combined p-value", "Ids"
        )
        empty_data_frame <- data.frame(matrix(nrow = 0, ncol = length(columns)))
        colnames(empty_data_frame) <- columns
        write_tsv(empty_data_frame, snakemake@output[["table"]])
        write_tsv(empty_data_frame, snakemake@output[["table_activated"]])
        write_tsv(empty_data_frame, snakemake@output[["table_inhibited"]])
    }
}
from string import Template
import pandas as pd
from io import StringIO


HTML = r"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title></title>
</head>
<body>
    <div id="vis" style="width: 100%; height: 90vh;"></div>
    <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
    <script>
        var vega_specs = $json;
        vegaEmbed('#vis', vega_specs);
    </script>
</body>
</html>
"""


def main(snakemake):

    # read vega js file with template vars
    # `$data`, `$sig_level`, `$beta_column` and `$beta_se_column`
    with open(snakemake.input.spec, "rt") as f:
        spec = Template(f.read())

    sig_level = snakemake.params.sig_level_volcano
    primary_var = snakemake.params.primary_variable

    # find column that matches primary variable
    df: pd.DataFrame = pd.read_csv(snakemake.input.tsv, sep="\t")

    primary_cols = [
        c
        for c in list(df.columns)
        if c.startswith(f"b_{primary_var}") and not c.endswith("_se")
    ]
    if len(primary_cols) > 1:
        print(f"WARNING: found {len(primary_cols)} possible primary variables")
    beta_col = primary_cols[0]

    # only keep columns needed for plot
    df = df[["ens_gene", "ext_gene", "target_id", "qval", beta_col, beta_col + "_se"]]

    # nan / NA / None values do not get plotted, so remove respective rows
    df = df.dropna()

    data = StringIO()
    df.to_csv(data, sep="\t", index=False)

    # update the spec with concrete values
    json = spec.safe_substitute(
        data=data.getvalue().replace("\t", r"\t").replace("\n", r"\n"),
        sig_level=sig_level,
        beta_column=beta_col,
        beta_se_column=beta_col + "_se",
    )

    with open(snakemake.output.json, "wt") as f:
        f.write(json)



if __name__ == "__main__":
    main(snakemake)