A Snakemake workflow for differential expression analysis of RNA-seq data with Kallisto and Sleuth.

Version: v2.5.1


Usage

The usage of this workflow is described in the Snakemake Workflow Catalog; a minimal module-style sketch also follows below.

If you use this workflow in a paper, please give credit to the authors by citing the URL of this (original) repository and its DOI (see above).
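As a sketch of that usage, the workflow could be consumed as a Snakemake module. The repository location and the tag are assumptions inferred from the version shown above; consult the catalog entry for the authoritative instructions.

module rna_seq_kallisto_sleuth:
    snakefile:
        # repository and tag are assumptions, not taken from this page
        github(
            "snakemake-workflows/rna-seq-kallisto-sleuth",
            path="workflow/Snakefile",
            tag="v2.5.1",
        )
    config:
        config

use rule * from rna_seq_kallisto_sleuth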

Code Snippets

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess
import sys
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
fmt = snakemake.params.fmt
build = snakemake.params.build
flavor = snakemake.params.get("flavor", "")

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

if flavor:
    flavor += "."

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffix = ""
if fmt == "gtf":
    suffix = "gtf.gz"
elif fmt == "gff3":
    suffix = "gff3.gz"

url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{fmt}/{species}/{species_cap}.{build}.{release}.{flavor}{suffix}".format(
    release=release,
    build=build,
    species=species,
    fmt=fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)

try:
    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
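For orientation, here is a minimal sketch of a rule that could drive this download script through its wrapper. The output path and the species/release/build values are placeholders; the params names are exactly those the script reads, and the wrapper version is the one referenced further down this page.

rule get_annotation:
    output:
        "resources/genome.gtf",
    params:
        species="homo_sapiens",  # placeholder values, adapt to your config
        release="100",
        build="GRCh38",
        fmt="gtf",
    log:
        "logs/get_annotation.log",
    wrapper:
        "0.80.1/bio/reference/ensembl-annotation"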
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 1, "Input must contain 1 (single-end) element."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq}"
    " {snakemake.input[0]}"
    " > {snakemake.output.qc} {log}"
)
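A minimal rule for this single-end trimming snippet might look like the following sketch. The paths and adapter string are hypothetical; the output.fastq/output.qc names and the adapters/extra params are taken from the script above.

rule cutadapt_se:
    input:
        "reads/{sample}.fastq.gz",
    output:
        fastq="results/trimmed/{sample}.fastq.gz",
        qc="results/trimmed/{sample}.qc.txt",
    params:
        adapters="-a AGATCGGAAGAGC",  # hypothetical adapter
        extra="-q 20 --minimum-length 33",
    threads: 8
    log:
        "logs/cutadapt/{sample}.log",
    wrapper:
        "v2.6.0/bio/cutadapt/se"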
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2016, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"

from os.path import splitext

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
    raise ValueError("A reference genome has to be provided!")
elif len(snakemake.input) > 1:
    raise ValueError("Only one reference genome can be inputed!")

# Prefix that should be used for the database
prefix = snakemake.params.get("prefix", splitext(snakemake.output.idx[0])[0])

if len(prefix) > 0:
    prefix = "-p " + prefix

# Construction algorithm used to build the index (e.g. bwtsw or is); bwa auto-selects if not given
construction_algorithm = snakemake.params.get("algorithm", "")

if len(construction_algorithm) != 0:
    construction_algorithm = "-a " + construction_algorithm

shell(
    "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}"
)
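A matching rule could look like this sketch; the paths are hypothetical, while the named idx output and the optional algorithm param mirror what the script accesses.

rule bwa_index:
    input:
        "resources/genome.fasta",
    output:
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    params:
        algorithm="bwtsw",  # optional; passed to 'bwa index -a'
    log:
        "logs/bwa_index.log",
    wrapper:
        "v1.17.2/bio/bwa/index"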
__author__ = "Johannes Köster, Julian de Ruiter"
__copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


import tempfile
from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Extract arguments.
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sorting", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")
samtools_opts = get_samtools_opts(snakemake)
java_opts = get_java_opts(snakemake)


index = snakemake.input.idx
if isinstance(index, str):
    index = path.splitext(snakemake.input.idx)[0]
else:
    index = path.splitext(snakemake.input.idx[0])[0]


# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")


if sort_order not in {"coordinate", "queryname"}:
    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))


# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":
    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view {samtools_opts}"

elif sort == "samtools":
    # Add name flag if needed.
    if sort_order == "queryname":
        sort_extra += " -n"

    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort {samtools_opts} {sort_extra} -T {tmpdir}"

elif sort == "picard":
    # Sort alignments using picard SortSam.
    pipe_cmd = "picard SortSam {java_opts} {sort_extra} --INPUT /dev/stdin --TMP_DIR {tmpdir} --SORT_ORDER {sort_order} --OUTPUT {snakemake.output[0]}"

else:
    raise ValueError(f"Unexpected value for params.sort ({sort})")

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(bwa mem"
        " -t {snakemake.threads}"
        " {extra}"
        " {index}"
        " {snakemake.input.reads}"
        " | " + pipe_cmd + ") {log}"
    )
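A hedged paired-end example for this wrapper follows; the paths are placeholders, while the named reads/idx inputs and the sorting/sort_order params match what the script reads.

rule bwa_mem:
    input:
        reads=["results/trimmed/{sample}.1.fastq.gz", "results/trimmed/{sample}.2.fastq.gz"],
        idx=multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa"),
    output:
        "results/mapped/{sample}.bam",
    params:
        sorting="samtools",       # one of: none, samtools, picard
        sort_order="coordinate",  # or queryname
    threads: 8
    log:
        "logs/bwa_mem/{sample}.log",
    wrapper:
        "v1.17.2/bio/bwa/mem"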
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@.
# One thread is used by samtools index itself;
# the value passed to '-@' is the number of *additional* threads.
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
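A corresponding rule can be as simple as this sketch (paths hypothetical):

rule samtools_index:
    input:
        "results/sorted/{sample}.bam",
    output:
        "results/sorted/{sample}.bam.bai",
    threads: 4
    log:
        "logs/samtools_index/{sample}.log",
    wrapper:
        "v1.18.3/bio/samtools/index"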
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts


samtools_opts = get_samtools_opts(snakemake)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


with tempfile.TemporaryDirectory() as tmpdir:
    tmp_prefix = Path(tmpdir) / "samtools_fastq.sort_"

    shell(
        "samtools sort {samtools_opts} {extra} -T {tmp_prefix} {snakemake.input[0]} {log}"
    )
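Analogously, a sketch of a sorting rule (paths hypothetical; the wrapper derives the -o target from the output):

rule samtools_sort:
    input:
        "results/mapped/{sample}.bam",
    output:
        "results/sorted/{sample}.bam",
    threads: 4
    log:
        "logs/samtools_sort/{sample}.log",
    wrapper:
        "v1.18.3/bio/samtools/sort"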
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq1}"
    " -p {snakemake.output.fastq2}"
    " {snakemake.input}"
    " > {snakemake.output.qc} {log}"
)
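A paired-end counterpart to the single-end sketch above might look as follows; paths and adapters are hypothetical, the fastq1/fastq2/qc output names come from the script.

rule cutadapt_pe:
    input:
        ["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"],
    output:
        fastq1="results/trimmed/{sample}.1.fastq.gz",
        fastq2="results/trimmed/{sample}.2.fastq.gz",
        qc="results/trimmed/{sample}.pe.qc.txt",
    params:
        adapters="-a AGATCGGAAGAGC -A AGATCGGAAGAGC",  # hypothetical adapters
        extra="-q 20 --minimum-length 33",
    threads: 8
    log:
        "logs/cutadapt/{sample}.log",
    wrapper:
        "v2.6.0/bio/cutadapt/pe"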
__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allowing for multiple FASTA files
fasta = snakemake.input.get("fasta")
assert fasta is not None, "input: a FASTA file is required"
fasta = " ".join(fasta) if isinstance(fasta, list) else fasta

shell(
    "kallisto index "  # Tool
    "{extra} "  # Optional parameters
    "--index={snakemake.output.index} "  # Output file
    "{fasta} "  # Input FASTA files
    "{log}"  # Logging
)
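A minimal indexing rule for this snippet (paths hypothetical; the named fasta input and index output follow the script above):

rule kallisto_index:
    input:
        fasta="resources/transcriptome.cdna.fasta",
    output:
        index="results/kallisto/transcripts.idx",
    log:
        "logs/kallisto_index.log",
    wrapper:
        "v1.23.1/bio/kallisto/index"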
__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allowing for multiple FASTQ files
fastq = snakemake.input.get("fastq")
assert fastq is not None, "input: a FASTQ file is required"
fastq = " ".join(fastq) if isinstance(fastq, list) else fastq

shell(
    "kallisto quant "  # Tool
    "{extra} "  # Optional parameters
    "--threads={snakemake.threads} "  # Number of threads
    "--index={snakemake.input.index} "  # Input file
    "--output-dir={snakemake.output} "  # Output directory
    "{fastq} "  # Input FASTQ files
    "{log}"  # Logging
)
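A quantification rule might look like this sketch. Paths are placeholders, the output is a directory as required by the --output-dir call above, and the bootstrap count is a hypothetical extra parameter (sleuth downstream needs bootstraps).

rule kallisto_quant:
    input:
        fastq=["results/trimmed/{sample}.1.fastq.gz", "results/trimmed/{sample}.2.fastq.gz"],
        index="results/kallisto/transcripts.idx",
    output:
        directory("results/kallisto/{sample}"),
    params:
        extra="--bootstrap-samples=100",  # hypothetical value
    threads: 8
    log:
        "logs/kallisto_quant/{sample}.log",
    wrapper:
        "v1.23.1/bio/kallisto/quant"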
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = []
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
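As with the annotation script above, a hedged sketch of a driving rule; the values are placeholders, while the params names and the datatype choices come from the script.

rule get_transcriptome:
    output:
        "resources/transcriptome.cdna.fasta",
    params:
        species="homo_sapiens",  # placeholder values
        release="100",
        build="GRCh38",
        datatype="cdna",         # one of: dna, cdna, cds, ncrna, pep
    log:
        "logs/get_transcriptome.log",
    wrapper:
        "v1.7.1/bio/reference/ensembl-sequence"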
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2017, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")

shell("datavzrd {snakemake.input.config} {extra} --output {snakemake.output[0]} {log}")
wrapper:
    "v2.6.0/utils/datavzrd"

wrapper:
    "v2.6.0/utils/datavzrd"

wrapper:
    "v2.6.0/utils/datavzrd"

script:
    "../scripts/compose-sample-sheet.py"

script:
    "../scripts/sleuth-init.R"

script:
    "../scripts/sleuth-diffexp.R"

script:
    "../scripts/ihw-fdr-control.R"

script:
    "../scripts/plot-bootstrap.R"

script:
    "../scripts/plot-pca.R"

script:
    "../scripts/plot-diffexp-pval-hist.R"

script:
    "../scripts/sleuth-to-matrix.R"

script:
    "../scripts/plot_diffexp_heatmap.R"

script:
    "../scripts/plot-group-density.R"

script:
    "../scripts/plot-scatter.R"

script:
    "../scripts/plot-fld.R"

script:
    "../scripts/plot-variances.R"

script:
    "../scripts/vega_plot_volcano.py"

script:
    "../scripts/isoform-switch-analysis-init.R"

shell:
    "pfam_scan.pl -fasta {input.fasta} -dir {params.pfam_dir} > {output} 2> {log}"

shell:
    "cpat.py -g {input.fasta} -d {input.cpat_model} -x {input.hexamers} -o {output} 2> {log}"

script:
    "../scripts/isoform-switch-analysis-annotate.R"

script:
    "../scripts/spia.R"

script:
    "../scripts/fgsea.R"

script:
    "../scripts/plot-fgsea-gene-sets.R"

script:
    "../scripts/ens_gene_to_go.R"

shell:
    "( curl --silent -o {output} {params.download} ) 2> {log}"

script:
    "../scripts/goatools-go-enrichment-analysis.py"

script:
    "../scripts/get-sample-hist-bins.py"

script:
    "../scripts/plot-3prime-qc-histogram.py"

script:
    "../scripts/plot-3prime-qc-histogram.py"

shell:
    "samtools view {input.bam_file}/pseudoalignments.bam | cut -f1,3,4,10,11 > {output} 2> {log}"

wrapper:
    "v1.23.1/bio/kallisto/index"

wrapper:
    "v1.23.1/bio/kallisto/quant"

wrapper:
    "v1.17.2/bio/bwa/index"

wrapper:
    "v1.17.2/bio/bwa/mem"

shell:
    "samtools view -h -F 4 {input.mapped_bam} | cut -f1-12 | grep -f {input.canonical_ids} | samtools view -o {output.canonical_mapped_bam} 2> {log}"

shell:
    "samtools view {input.canonical_mapped_bam} | cut -f1,3,4,10,11 > {output} 2> {log}"

wrapper:
    "v1.18.3/bio/samtools/sort"

wrapper:
    "v1.18.3/bio/samtools/index"

script:
    "../scripts/get-3prime-max-positions.py"

shell:
    "samtools view -R {input.canonical_mapped_3prime_pos} {input.canonical_mapped_bam} -o {output.canonical_mapped_3prime_bam} 2> {log}"

shell:
    "samtools bam2fq {input.canonical_3prime_mapped_bam} > {output.canonical_fastq} 2> {log}"

shell:
    "samtools sort {input}/pseudoalignments.bam > {output} 2> {log}"

wrapper:
    "v1.18.3/bio/samtools/index"

wrapper:
    "v1.23.1/bio/kallisto/index"

wrapper:
    "v1.23.1/bio/kallisto/quant"

script:
    "../scripts/remove_poly_tails.py"

script:
    "../scripts/remove_strand_info.py"

script:
    "../scripts/get_canonical_ids.R"

shell:
    """bioawk -cfastx \
    'BEGIN{{while((getline k <"{input.canonical_ids}")>0)i[k]=1}} \
    {{if(i[$name])print ">"$name"\\n"$seq}}' \
    {input.fasta} > {output}"""

wrapper:
    "v1.7.1/bio/reference/ensembl-sequence"
From line 14 of rules/ref.smk

wrapper:
    "0.80.1/bio/reference/ensembl-annotation"
From line 29 of rules/ref.smk

script:
    "../scripts/get-transcript-info.R"
From line 45 of rules/ref.smk

shell:
    "(curl -L ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/"
    "Pfam{params.release}/Pfam-A.{wildcards.ext}.gz | "
    "gzip -d > {output}) 2> {log}"
From line 56 of rules/ref.smk

shell:
    "hmmpress {input} > {log} 2>&1"
From line 72 of rules/ref.smk

shell:
    "make_hexamer_tab.py --cod={input.cds} --noncod={input.ncrna} > {output} 2> {log}"
From line 87 of rules/ref.smk

shell:
    "make_logitModel.py --hex={input.hexamers} --cgene={input.cds} "
    "--ngene={input.ncrna} -o {params.prefix} 2> {log}"
From line 105 of rules/ref.smk

script:
    "../scripts/get-spia-db.R"
From line 124 of rules/ref.smk

wrapper:
    "v2.6.0/bio/cutadapt/pe"
From line 14 of rules/trim.smk

wrapper:
    "v2.6.0/bio/cutadapt/se"
From line 30 of rules/trim.smk

script:
    "../scripts/get-max-read-length.py"
From line 43 of rules/trim.smk
import sys
sys.stderr = open(snakemake.log[0], "w")

samples_ = snakemake.params.units[["sample", "unit"]].merge(snakemake.params.samples, on="sample")
samples_["sample"] = samples_.apply(
    lambda row: "{}-{}".format(row["sample"], row["unit"]), axis=1
)
samples_["path"] = list(snakemake.input.kallisto_output)
del samples_["unit"]
samples_.to_csv(snakemake.output[0], sep="\t", index=False)
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides `tidyverse` and load_bioconductor_package()
source(snakemake@params[["common_src"]])

ens_gene_to_go <-
    AnnotationDbi::select(get(snakemake@params[["bioc_species_pkg"]]),
        keys = keys(get(snakemake@params[["bioc_species_pkg"]]),
            keytype = "ENSEMBL"
        ), columns = c("GO"),
        keytype = "ENSEMBL"
    ) %>%
    # rows with empty ENSEMBL or GO IDs are useless and will lead to trouble below
    drop_na(ENSEMBL, GO) %>%
    dplyr::rename(ensembl_gene_id = ENSEMBL) %>%
    group_by(ensembl_gene_id) %>%
    summarise(go_ids = str_c(GO, collapse = ";"))


write_tsv(ens_gene_to_go, file = snakemake@output[[1]], col_names = FALSE)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("fgsea")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
                  drop_na(ext_gene) %>%
                  mutate(ext_gene = str_to_upper(ext_gene)) %>%
                  # resolve multiple occurrences of the same ext_gene, usually
                  # meaning that several Ensembl genes map to the same gene
                  # symbol -- so we may lose resolution here, as long as gene
                  # symbols are used
                  group_by(ext_gene) %>%
                    filter( qval == min(qval, na.rm = TRUE) ) %>%
                    filter( pval == min(pval, na.rm = TRUE) ) %>%
                    # for the case of min(qval) == 1 and min(pval) == 1, we
                    # need something else to select a unique entry per gene
                    filter( mean_obs == max(mean_obs, na.rm = TRUE) ) %>%
                    mutate(target_id = str_c(target_id, collapse=",")) %>%
                    mutate(ens_gene = str_c(ens_gene, collapse=",")) %>%
                  distinct()

signed_pi <- get_prefix_col("signed_pi_value", colnames(diffexp))

ranked_genes <- diffexp %>%
                  dplyr::select(ext_gene, !!signed_pi) %>%
                  deframe()

# get and write out rank values that are tied -- a way to check up on respective warnings
rank_ties <- enframe(ranked_genes) %>%
               mutate(dup = duplicated(value) | duplicated(value, fromLast = TRUE) ) %>%
               filter(dup == TRUE) %>%
               dplyr::select(ext_gene = name, !!signed_pi := value)
write_tsv(rank_ties, snakemake@output[["rank_ties"]])

print(gene_sets)
print(ranked_genes)

fgsea_res <- fgsea(pathways = gene_sets,
                    stats = ranked_genes,
                    minSize=10,
                    maxSize=700,
                    nproc=snakemake@threads,
                    eps=snakemake@params[["eps"]]
                    ) %>%
                as_tibble()

# Annotation is impossible without any entries, so then just write out empty files
if ( (fgsea_res %>% count() %>% pull(n)) == 0 ) {

    # leadingEdge cannot be a list, and the empty output should at least
    # have the correct columns in the correct order
    unnested <- fgsea_res %>%
                    unnest(leadingEdge) %>%
                    add_column(
                        leading_edge_symbol = NA,
                        leading_edge_ens_gene = NA,
                        leading_edge_entrez_id = NA
                    ) %>%
                    dplyr::relocate(
                        c(
                          leading_edge_symbol,
                          leading_edge_ens_gene,
                          leading_edge_entrez_id,
                          leadingEdge
                        ),
                        .after = last_col()
                    )

    write_tsv(unnested, file = snakemake@output[["enrichment"]])
    write_tsv(unnested, file = snakemake@output[["significant"]])

} else {

    # add further annotation
    annotated <- fgsea_res %>%
                    unnest(leadingEdge) %>%
                    mutate(
                        leading_edge_symbol = str_to_title(leadingEdge),
                        leading_edge_entrez_id = mapIds(leading_edge_symbol, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENTREZID"),
                        leading_edge_ens_gene = mapIds(leading_edge_symbol, x=get(snakemake@params[["bioc_species_pkg"]]), keytype="SYMBOL", column="ENSEMBL")
                        ) %>%
                    group_by(pathway) %>%
                    summarise(
                        leadingEdge = str_c(leadingEdge, collapse = ','),
                        leading_edge_symbol = str_c(leading_edge_symbol, collapse = ','),
                        leading_edge_entrez_id = str_c(leading_edge_entrez_id, collapse = ','),
                        leading_edge_ens_gene = str_c(leading_edge_ens_gene, collapse = ',')
                    ) %>%
                    inner_join(fgsea_res %>% dplyr::select(-leadingEdge), by = "pathway") %>%
                    dplyr::relocate(
                        c(
                          leading_edge_symbol,
                          leading_edge_ens_gene,
                          leading_edge_entrez_id,
                          leadingEdge
                        ),
                        .after = last_col()
                    )

    # write out fgsea results for all gene sets
    write_tsv(annotated, file = snakemake@output[["enrichment"]])

    # select significant pathways
    sig_gene_sets <- annotated %>%
                       filter( padj < snakemake@params[["gene_set_fdr"]] )

    # write out fgsea results for gene sets found to be significant
    write_tsv(sig_gene_sets, file = snakemake@output[["significant"]])
}

# select significant pathways
top_pathways <- fgsea_res %>% arrange(padj) %>% head(n=1000) %>% filter(padj <= snakemake@params[["gene_set_fdr"]]) %>% arrange(-NES) %>% pull(pathway)
selected_gene_sets <- gene_sets[top_pathways]

height = .7 * (length(selected_gene_sets) + 2)

# table plot of all gene sets
tg <- plotGseaTable(
            pathways = selected_gene_sets,
            stats = ranked_genes,
            fgseaRes = fgsea_res,
            gseaParam = 1,
            render = FALSE
        )
ggsave(filename = snakemake@output[["plot"]], plot = tg, width = 15, height = height, limitsize=FALSE)

collapsed_pathways <- collapsePathways(fgsea_res %>% arrange(pval) %>% filter(padj <= snakemake@params[["gene_set_fdr"]]), gene_sets, ranked_genes)
main_pathways <- fgsea_res %>% filter(pathway %in% collapsed_pathways$mainPathways) %>% arrange(-NES) %>% pull(pathway)
selected_gene_sets <- gene_sets[main_pathways]
height = .7 * (length(selected_gene_sets) + 2)

# table plot of all gene sets
tg <- plotGseaTable(
            pathways = selected_gene_sets,
            stats = ranked_genes,
            fgseaRes = fgsea_res,
            gseaParam = 1,
            render = FALSE
        )
ggsave(filename = snakemake@output[["plot_collapsed"]], plot = tg, width = 15, height = height, limitsize=FALSE)
import pandas as pd
import numpy as np
import pysam
import sys

sys.stderr = open(snakemake.log[0], "w")

bam_file = snakemake.input["canonical_mapped_bam"]
sample_name = bam_file.split(".canonical.mapped.sorted.bam")[0]
# Bam file reading
bam_file = pysam.AlignmentFile(snakemake.input["canonical_mapped_bam"], "rb")
bam_header = bam_file.header.to_dict()
trans_length_data = pd.DataFrame(bam_header.get("SQ"))
trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)

# Aligned text file reading
align_bam_txt = pd.read_csv(
    snakemake.input["canonical_mapped_pos"],
    sep="\t",
    names=["read_name", "Transcript_ID", "Start", "read", "Quality"],
)
align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]
merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")

# reads aligned to forward strand
forward_strand = merge_data.loc[merge_data["Strand"] == "1"]
forward_strand[sample_name + "_forward_strand"] = (
    forward_strand["LN"] - forward_strand["Start"]
)
aligned_reads = forward_strand.loc[
    forward_strand.groupby(["read_name", "read"])[
        sample_name + "_forward_strand"
    ].idxmin()
]

# reads aligned to reverse strand
reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]
read_min = reverse_strand.loc[
    reverse_strand.groupby(["read_name", "read"])["Start"].idxmin()
]
aligned_reads = pd.concat([aligned_reads, read_min])
aligned_reads.to_csv(
    snakemake.output["canonical_mapped_3prime_pos"],
    columns=["read_name"],
    sep="\t",
    index=False,
)
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


library(biomaRt)
library(dplyr)
library(tidyverse)

ensembl <- useEnsembl(
    biomart = "genes", dataset = "hsapiens_gene_ensembl",
    version = snakemake@params[["release"]]
)
get_transcripts_ids <-
    getBM(
        attributes = c(
            "ensembl_transcript_id_version",
            "transcript_is_canonical", "transcript_mane_select", "chromosome_name"
        ),
        mart = ensembl
    )
# transcript_mane_select: the manually curated primary transcript, as encoded by NCBI (MANE select)
canonical_ids <- get_transcripts_ids %>%
    select(
        ensembl_transcript_id_version, transcript_is_canonical,
        transcript_mane_select, chromosome_name
    ) %>%
    filter(!str_detect(chromosome_name, "patch|PATCH")) %>%
    # str_detect(x, "") is TRUE for every non-missing value, i.e. keep
    # transcripts that carry a MANE select annotation
    filter(str_detect(transcript_mane_select, "")) %>%
    subset(transcript_is_canonical == 1)
write.csv(canonical_ids$ensembl_transcript_id_version,
    file = snakemake@output[[1]], quote = FALSE, row.names = FALSE
)
import json
import pysam

max_read_length = max(
    len(rec.sequence) for f in snakemake.input for rec in pysam.FastxFile(f)
)

with open(snakemake.output[0], "w") as out:
    json.dump(max_read_length, out)
import pandas as pd
import numpy as np
import pysam
from scipy import stats
import sys
import json
import csv


sys.stderr = open(snakemake.log[0], "w")

# Get the read length
with open(snakemake.input["read_length"]) as f:
    read_length = json.load(f)

# Transcript ID to process individually, or "all" to process all transcripts
transcript_ids = snakemake.params["each_transcript"]

# Define data-frame for each strand
fwrd_allsamp_hist = pd.DataFrame([])
fwrd_allsamp_hist_trim = pd.DataFrame([])
fwrd_allsamp_hist_fil = pd.DataFrame([])
rev_allsamp_hist = pd.DataFrame([])
rev_allsamp_hist_trim = pd.DataFrame([])
rev_allsamp_hist_fil = pd.DataFrame([])

# Get the sample names
sample_name = snakemake.params["samples"]

# Bam file reading
bam_file = pysam.AlignmentFile(snakemake.input["samtools_sort"], "rb")
bam_header = bam_file.header.to_dict()
trans_length_data = pd.DataFrame(bam_header.get("SQ"))
trans_length_data.rename(columns={"SN": "Transcript_ID"}, inplace=True)

# Aligned text file reading
align_bam_txt = pd.read_csv(
    snakemake.input["aligned_file"],
    sep="\t",
    names=["read_Name", "Transcript_ID", "Start", "reads", "Quality"],
)
align_bam_txt["Strand"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[1]
align_bam_txt["Transcript"] = align_bam_txt["Transcript_ID"].str.split("_", 1).str[0]


# Both transcript len and start postion are merged based on same transcript ID
merge_data = align_bam_txt.merge(trans_length_data, on="Transcript_ID")

# Forward strand
forward_strand = merge_data.loc[merge_data["Strand"] == "1"]

# Each read's position relative to the transcript 3' end is calculated
forward_strand[sample_name + "_forward_strand"] = (
    forward_strand["LN"] - forward_strand["Start"]
)
aligned_reads = forward_strand.loc[
    forward_strand.groupby(["read_Name", "reads"])[
        sample_name + "_forward_strand"
    ].idxmin()
]
if aligned_reads["Transcript"].str.contains(transcript_ids).any():
    # Get aligned read positions for the given transcript
    fwrd_filtered_transcript_data = aligned_reads.query("Transcript == @transcript_ids")
    Freq_fwrd_fil, bins_fwrd_fil = np.histogram(
        fwrd_filtered_transcript_data[sample_name + "_forward_strand"],
        bins=read_length,
        range=[0, max(fwrd_filtered_transcript_data["LN"])],
    )
    hist_fwrd_fil = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_forward": Freq_fwrd_fil,
            "bins_foward": bins_fwrd_fil[:-1],
        }
    )
    fwrd_allsamp_hist_fil = pd.concat([fwrd_allsamp_hist_fil, hist_fwrd_fil])
elif transcript_ids == "all":
    # Values added to corresponding bins
    Freq_fwrd, bins_fwrd = np.histogram(
        aligned_reads[sample_name + "_forward_strand"],
        bins=read_length,
        range=[0, max(aligned_reads["LN"])],
    )
    Freq_fwrd_trim, bins_fwrd_trim = np.histogram(
        forward_strand[sample_name + "_forward_strand"],
        bins=read_length,
        range=[0, 20000],
    )
    # Dataframe created for bins
    hist_fwrd = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_forward": Freq_fwrd,
            "bins_foward": bins_fwrd[:-1],
        }
    )
    hist_fwrd_trim = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_forward": Freq_fwrd_trim,
            "bins_foward": bins_fwrd_trim[:-1],
        }
    )
    # The per-sample dataframes are concatenated
    fwrd_allsamp_hist = pd.concat([fwrd_allsamp_hist, hist_fwrd])
    fwrd_allsamp_hist_trim = pd.concat([fwrd_allsamp_hist_trim, hist_fwrd_trim])
    fwrd_allsamp_hist.to_csv(
        snakemake.output["fwrd_allsamp_hist"], sep="\t", index=False
    )
    fwrd_allsamp_hist_trim.to_csv(
        snakemake.output["fwrd_allsamp_hist_trim"], sep="\t", index=False
    )

# Reverse strand
reverse_strand = merge_data.loc[merge_data["Strand"] == "-1"]

# The minimum aligned start position is taken
read_min = reverse_strand.loc[
    reverse_strand.groupby(["read_Name", "reads"])["Start"].idxmin()
]

if read_min["Transcript"].str.contains(transcript_ids).any():
    # Get aligned read positions for the given transcript
    rev_filtered_transcript_data = read_min.query("Transcript == @transcript_ids")
    Freq_rev_fil, bins_rev_fil = np.histogram(
        rev_filtered_transcript_data["Start"],
        bins=read_length,
        range=[0, max(rev_filtered_transcript_data["LN"])],
    )
    hist_rev_fil = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_rev": Freq_rev_fil,
            "bins_rev": bins_rev_fil[:-1],
        }
    )
    rev_allsamp_hist_fil = pd.concat([rev_allsamp_hist_fil, hist_rev_fil])

elif transcript_ids == "all":
    # Values added to corresponding bins
    Freq_rev, bins_rev = np.histogram(
        read_min["Start"], bins=read_length, range=[0, max(read_min["LN"])]
    )
    Freq_rev_trim, bins_rev_trim = np.histogram(
        read_min["Start"], bins=read_length, range=[0, 20000]
    )

    # Dataframe created for bins
    hist_rev = pd.DataFrame(
        {"sample_Name": sample_name, "Freq_rev": Freq_rev, "bins_rev": bins_rev[:-1]}
    )
    hist_rev_trim = pd.DataFrame(
        {
            "sample_Name": sample_name,
            "Freq_rev": Freq_rev_trim,
            "bins_rev": bins_rev_trim[:-1],
        }
    )

    # The per-sample dataframes are concatenated
    rev_allsamp_hist = pd.concat([rev_allsamp_hist, hist_rev])
    rev_allsamp_hist_trim = pd.concat([rev_allsamp_hist_trim, hist_rev_trim])
    rev_allsamp_hist.to_csv(snakemake.output["rev_allsamp_hist"], sep="\t", index=False)
    rev_allsamp_hist_trim.to_csv(
        snakemake.output["rev_allsamp_hist_trim"], sep="\t", index=False
    )


# Write bins to each file
if transcript_ids != "all":
    if fwrd_allsamp_hist_fil.empty:
        with open(snakemake.output["fwrd_allsamp_hist_fil"], "w") as fwrd_csvfile:
            print("no reads aligned", file=fwrd_csvfile)
    else:
        with open(snakemake.output["fwrd_allsamp_hist_fil"], "w") as fwrd_csvfile:
            fwrd_allsamp_hist_fil.to_csv(fwrd_csvfile, sep="\t", index=False)
    if rev_allsamp_hist_fil.empty:
        with open(snakemake.output["rev_allsamp_hist_fil"], "w") as rev_csvfile:
            print("no reads aligned", file=rev_csvfile)
    else:
        with open(snakemake.output["rev_allsamp_hist_fil"], "w") as rev_csvfile:
            rev_allsamp_hist_fil.to_csv(rev_csvfile, sep="\t", index=False)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("SPIA")
library("graphite")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

pw_db <- snakemake@params[["pathway_db"]]

db <- pathways(snakemake@params[["species"]], pw_db)
db <- convertIdentifiers(db, "ENSEMBL")

saveRDS(db, snakemake@output[[1]])
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("biomaRt")
library("tidyverse")
library("dplyr")

# this variable holds a mirror name until
# useEnsembl succeeds ("www" is last, because
# of very frequent "Internal Server Error"s)
mart <- "useast"
rounds <- 0
while (class(mart)[[1]] != "Mart") {
  mart <- tryCatch(
    {
      # done here, because error function does not
      # modify outer scope variables, I tried
      if (mart == "www") rounds <- rounds + 1
      # equivalent to useMart, but you can choose
      # the mirror instead of specifying a host
      biomaRt::useEnsembl(
        biomart = "ENSEMBL_MART_ENSEMBL",
        dataset = str_c(snakemake@params[["species"]], "_gene_ensembl"),
        version = snakemake@params[["version"]],
        mirror = mart
      )
    },
    error = function(e) {
      # change or make configurable if you want more or
      # less rounds of tries of all the mirrors
      if (rounds >= 3) {
        stop(
          str_c(
            "Have tried all 4 available Ensembl biomaRt mirrors ",
            rounds,
            " times. You might have a connection problem, or no mirror is responsive."
          )
        )
      }
      # hop to next mirror
      mart <- switch(mart,
        useast = "uswest",
        uswest = "asia",
        asia = "www",
        www = {
          # wait before starting another round through the mirrors,
          # hoping that intermittent problems disappear
          Sys.sleep(30)
          "useast"
        }
      )
    }
  )
}
three_prime_activated <- snakemake@params[["three_prime_activated"]]

attributes <- c("ensembl_transcript_id",
                "ensembl_gene_id",
                "external_gene_name",
                "description")
has_canonical <-
  "transcript_is_canonical" %in% biomaRt::listAttributes(mart = mart)$name
has_mane_select <-
  "transcript_mane_select" %in% biomaRt::listAttributes(mart = mart)$name
# request canonical / MANE select attributes only if biomart provides them,
# and request the 3'-specific columns only when three-prime mode is active
if (has_canonical && three_prime_activated) {
  attributes <- c(attributes, "transcript_is_canonical", "chromosome_name",
    "transcript_mane_select", "ensembl_transcript_id_version")
} else if (has_canonical) {
  attributes <- c(attributes, "transcript_is_canonical")
}
t2g <- biomaRt::getBM(
attributes = attributes,
mart = mart,
useCache = FALSE
)
# Add NA columns for attributes that were not requested above, so that
# downstream code can rely on all columns being present
if (!has_canonical) {
  t2g <- t2g %>% add_column(transcript_is_canonical = NA)
}
if (!has_canonical || !three_prime_activated) {
  t2g <- t2g %>% add_column(chromosome_name = NA, transcript_mane_select = NA,
      ensembl_transcript_id_version = NA)
}
t2g <- t2g %>%
  rename(
    target_id = ensembl_transcript_id,
    ens_gene = ensembl_gene_id,
    ext_gene = external_gene_name,
    gene_desc = description,
    canonical = transcript_is_canonical
  ) %>%
  mutate_at(
    vars(gene_desc),
    function(values) {
      str_trim(map(values, function(v) {
        str_split(v, r"{\[}")[[1]][1]
      }))
    } # remove trailing source annotation (e.g. [Source:HGNC Symbol;Acc:HGNC:5])
  ) %>%
  mutate_at(
    vars(canonical),
    function(values) {
      as_vector(
        map(
          str_trim(values),
          function(v) {
            if (is.na(v)) {
              NA
            } else if (v == "1") {
              TRUE
            } else if (v == "0") {
              FALSE
            }
          }
        )
      )
    }
  )
# If 3-prime-rna-seq mode is activated, keep only MANE select transcripts and drop chromosomes defined as "patch"
if (three_prime_activated) {
  if (has_mane_select) {
    t2g <- t2g %>%
    filter(!str_detect(chromosome_name, "patch|PATCH")) %>%
    filter(str_detect(transcript_mane_select, ""))
  } else {
    stop(
      "the transcript_mane_select attribute is required in biomart when three-prime mode is activated"
    )
  }
}
write_rds(t2g, file = snakemake@output[[1]], compress = "gz")
import sys

sys.stderr = open(snakemake.log[0], "w")
sys.stdout = open(snakemake.log[0], "a")

import pandas as pd
import matplotlib.pyplot as plt
from goatools.obo_parser import GODag
from goatools.anno.idtogos_reader import IdToGosReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.godag_plot import plot_results  # , plot_goid2goobj, plot_gos
# read in directed acyclic graph of GO terms / IDs
obodag = GODag(snakemake.input.obo)


# read in mapping gene ids from input to GO terms / IDs
objanno = IdToGosReader(snakemake.input.ens_gene_to_go, godag=obodag)


# extract namespace(?) -> id2gos mapping
ns2assoc = objanno.get_ns2assc()

for nspc, id2gos in ns2assoc.items():
    print("{NS} {N:,} annotated genes".format(NS=nspc, N=len(id2gos)))

# read gene diffexp table
all_genes = pd.read_table(snakemake.input.diffexp)
# select genes significantly differentially expressed according to BH FDR of sleuth
fdr_level_gene = float(snakemake.params.gene_fdr)
sig_genes = all_genes[all_genes["qval"] < fdr_level_gene]

# initialize GOEA object
fdr_level_go_term = float(snakemake.params.go_term_fdr)
model = snakemake.params.model

goeaobj = GOEnrichmentStudyNS(
    # list of 'population' of genes looked at in total
    pop=all_genes["ens_gene"].tolist(),
    # geneid -> GO ID mapping
    ns2assoc=ns2assoc,
    # ontology DAG
    godag=obodag,
    propagate_counts=False,
    # multiple testing correction method (fdr_bh is false discovery rate control with Benjamini-Hochberg)
    methods=["fdr_bh"],
    # significance cutoff for method named above
    alpha=fdr_level_go_term,
)

goea_results_all = goeaobj.run_study(sig_genes["ens_gene"].tolist())
ensembl_id_to_symbol = dict(zip(all_genes["ens_gene"], all_genes["ext_gene"]))

go_items = [
    val
    for cat in ["BP", "CC", "MF"]
    for item, val in goeaobj.ns2objgoea[cat].assoc.items()
]

# goea_results_all.rename(columns={'# GO': 'GO'})


if goea_results_all:
    go_terms = pd.DataFrame(
        list(
            map(
                lambda x: [
                    x.GO,
                    x.goterm.name,
                    x.goterm.namespace,
                    x.p_uncorrected,
                    x.p_fdr_bh,
                    x.ratio_in_study,
                    x.ratio_in_pop,
                    x.depth,
                    x.study_count,
                    x.study_items,
                ],
                goea_results_all,
            )
        ),
        columns=[
            "GO",
            "term",
            "class",
            "p_uncorrected",
            "p_fdr_bh",
            "ratio_in_study",
            "ratio_in_pop",
            "depth",
            "study_count",
            "study_items",
        ],
    )
    go_terms["study_items"] = go_terms["study_items"].str.join(",")
    go_terms["study_items"] = go_terms.study_items.str.replace(
        "\w+(?=,|$)", lambda m: ensembl_id_to_symbol.get(m.group(0))
    )
    go_terms.to_csv(snakemake.output.enrichment, sep="\t", index=False)
else:
    # write empty table to indicate that nothing was found
    with open(snakemake.output.enrichment, "w") as out:
        print(
            "GO",
            "term",
            "class",
            "name",
            "p_uncorrected",
            "p_fdr_bh",
            "ratio_in_study",
            "ratio_in_pop",
            "depth",
            "study_count",
            "study_items",
            sep="\t",
            file=out,
        )


# plot results

# from first plot output file name, create generic file name to trigger
# separate plots for each of the gene ontology name spaces
outplot_generic = (
    snakemake.output.plot[0]
    .replace("_BP.", "_{NS}.", 1)
    .replace("_CC.", "_{NS}.", 1)
    .replace("_MF.", "_{NS}.", 1)
)

goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < fdr_level_go_term]

# https://github.com/mousepixels/sanbomics_scripts/blob/main/GO_in_python.ipynb
if goea_results_sig:
    go_sig_terms = pd.DataFrame(
        list(
            map(
                lambda x: [
                    x.GO,
                    x.goterm.name,
                    x.goterm.namespace,
                    x.p_uncorrected,
                    x.p_fdr_bh,
                    x.ratio_in_study[0],
                    x.ratio_in_study[1],
                    x.ratio_in_pop[0],
                    x.study_items,
                ],
                goea_results_sig,
            )
        ),
        columns=[
            "GO",
            "term",
            "class",
            "p-value",
            "p_corrected",
            "n_genes_diff_exp",
            "n_genes_diff_exp_study",
            "n_go_genes",
            "study_items",
        ],
    )
    go_sig_terms["gene_ratio"] = go_sig_terms.n_genes_diff_exp / go_sig_terms.n_go_genes
    go_sig_terms_sorted = go_sig_terms.sort_values(by=["class", "p_corrected"])
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted["study_items"].str.join(
        ","
    )
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.str.replace(
        "\w+(?=,|$)", lambda m: ensembl_id_to_symbol.get(m.group(0))
    )
    # Append fold change values to gene names
    gene_to_fold_change = dict(
        zip(sig_genes["ext_gene"], sig_genes.filter(regex=("b_" + model)).iloc[:, 0])
    )
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.astype("str")
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.str.split(",")
    # As go terms contains 0 genes associated with terms, assign empty key to dict fold change as 0
    gene_to_fold_change[""] = 0
    gene_to_fold_change["nan"] = 0
    go_sig_terms_sorted["study_items"] = go_sig_terms_sorted.study_items.map(
        lambda x: ", ".join(
            [f"{gene_symbol}:{gene_to_fold_change[gene_symbol]}" for gene_symbol in x]
        )
    )
    go_sig_terms_sorted.to_csv(
        snakemake.output.enrichment_sig_terms, sep="\t", index=False
    )
else:
    # write empty table to indicate that nothing was found
    with open(snakemake.output.enrichment_sig_terms, "w") as out:
        print(
            "GO",
            "term",
            "class",
            "p-value",
            "p_corrected",
            "n_genes_diff_exp",
            "n_genes_diff_exp_study",
            "n_go_genes",
            "gene_ratio",
            "study_items",
            sep="\t",
            file=out,
        )
plot_results(
    outplot_generic,
    # use pvals for coloring
    goea_results=goea_results_sig,
    # print general gene symbol instead of Ensembl ID
    id2symbol=ensembl_id_to_symbol,
    # number of genes to print, or True for printing all (including count of genes)
    study_items=None,
    # number of genes to print per line
    items_p_line=6,
    # p-values to use for coloring of GO term nodes (argument name determined from code and testing against value "p_uncorrected")
    pval_name="p_fdr_bh",
)

# for all name spaces
for ns in ns2assoc.keys():
    # check if no GO terms were found to be significant
    if len([r for r in goea_results_sig if r.NS == ns]) == 0:
        fig = plt.figure(figsize=(12, 2))
        text = fig.text(
            0.5,
            0.5,
            "No plot generated, because no GO terms were found significant\n"
            "for name space {} and significance levels: genes ({}), GO terms ({}).\n"
            "You might want to check those levels and/or your intermediate data.".format(
                ns, fdr_level_gene, fdr_level_go_term
            ),
            ha="center",
            va="center",
            size=20,
        )
        fig.savefig(outplot_generic.replace("_{NS}.", "_{}.".format(ns)))
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("tidyverse")
library("ggpubr")
library("IHW")

number_of_groups <- 7

gene_data <- read_tsv(snakemake@input[[1]])
level_name <- snakemake@wildcards[["level"]]

# calculate covariate mean_obs for gene.aggregated data
if(level_name == "genes-aggregated") {
  gene_data <- gene_data %>%
    mutate(mean_obs = if_else(num_aggregated_transcripts > 0, sum_mean_obs_counts / num_aggregated_transcripts, 0), ens_gene = target_id)
}

# determine the appropriate number of groups for grouping in ihw calculation
tested_number_of_groups <- number_of_groups
ihw_test_grouping <- function(x){
  tryCatch(
    expr = {
      ihw(pval ~ mean_obs, data = gene_data, alpha = 0.1, nbins = x)
      return(TRUE)
    },
    error = function(e) {
      print(str_c("Number of groups was set too hight, trying grouping by ", tested_number_of_groups - 1, " groups."))
      return(FALSE)
    })
}

while(!ihw_test_grouping(tested_number_of_groups) && tested_number_of_groups > 0){
  tested_number_of_groups <- tested_number_of_groups - 1
}

if (tested_number_of_groups < number_of_groups) {
  print(str_c("The chosen number of groups for ", level_name, " dataset was too large, instead IHW was calculated on ", tested_number_of_groups, " groups."))
  number_of_groups <- tested_number_of_groups
}

gene_data <- gene_data %>%
  drop_na(pval, mean_obs) %>%
  select(ens_gene, ext_gene, pval, mean_obs) %>%
  mutate(grouping = groups_by_filter(mean_obs, number_of_groups))

### diagnostic plots for covariate and grouping
#dispersion plot
dispersion <- ggplot(gene_data, aes(x = percent_rank(mean_obs), y = -log10(pval))) +
  geom_point() +
  ggtitle("IHW: scatter plot of p-values vs. mean of counts") +
  theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5)) +
  xlab("percent rank of mean_obs") +
  ylab(expression(-log[10]("p-value")))

ggsave(snakemake@output[["dispersion"]], dispersion)

#histograms
hist_overall <- ggplot(gene_data, aes(x = pval)) +
  geom_histogram(binwidth = 0.025, boundary = 0) +
  xlab("p-values without grouping") +
  ylab("density")

hist_groups <- ggplot(gene_data, aes(x = pval)) +
  geom_histogram(binwidth = 0.025, boundary = 0) +
  xlab("p-values of the individual groups") +
  ylab("density") +
  facet_wrap(~ grouping, nrow = ceiling(number_of_groups / 4)) 

plots_agg_mean_obs <- ggarrange(hist_overall, hist_groups, nrow = 1)

histograms <- annotate_figure(plots_agg_mean_obs,
                              top = text_grob("IHW: histograms for p-values of mean of counts",
                                              color = "black",
                                              face = "bold",
                                              size = 12))

ggexport(histograms,
         filename = snakemake@output[["histograms"]],
         width=14)

# ihw calculation
ihw_results_mean <- ihw(pval ~ mean_obs, data = gene_data, alpha = 0.1, nbins = tested_number_of_groups)

# merging ens_gene-IDs and ext_gene-names
ihw_mean <- as.data.frame(ihw_results_mean) %>% 
  # TODO remove ugly hack if ihw in future allows annotation columns (feature requested here: https://support.bioconductor.org/p/129972/)
  right_join(gene_data, by = c(pvalue = "pval", covariate = "mean_obs", group = "grouping")) %>%
  unique() %>%
  select(ens_gene, ext_gene, everything())

write_tsv(ihw_mean, snakemake@output[["results"]])

### diagnostic plots for ihw calculation
# plot of trends of the covariate
plot_trend_mean <- plot(ihw_results_mean) +
  ggtitle("IHW: Plot of trends of mean of counts") +
  theme(plot.title = element_text(size=12))
ggsave(snakemake@output[["trends"]], plot_trend_mean)

# plots of decision boundaries for unweighted p-values as a function of the covariate
plot_decision_mean <- plot(ihw_results_mean, what = "decisionboundary") +
  ggtitle("IHW: Decision boundaries for unweighted p-values vs. mean of counts") +
  theme(plot.title = element_text(size=12))
ggsave(snakemake@output[["decision"]], plot_decision_mean)

# p-values vs. adjusted p-values
plot_ihw_pval_mean <- ggplot(ihw_mean, aes(x = pvalue, y = adj_pvalue, col = group)) +
  geom_point(size = 0.25) +
  ggtitle("IHW: raw p-values vs. adjusted p-values from ihw-analysis") +
  theme(plot.title = element_text(size=12)) +
  scale_colour_hue(l = 70, c = 150, drop = FALSE)
ggsave(snakemake@output[["adj_pvals"]], plot_ihw_pval_mean)
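For orientation: the call above lets IHW learn per-group weights from the mean_obs covariate and then applies a weighted Benjamini-Hochberg procedure. A minimal, self-contained sketch on simulated data (the toy data frame and its columns are illustrative only, not part of the workflow):

library("IHW")

set.seed(42)
toy <- data.frame(
  pval = runif(1000),      # p-values to adjust
  mean_obs = rexp(1000)    # informative covariate, analogous to mean_obs above
)

res <- ihw(pval ~ mean_obs, data = toy, alpha = 0.1, nbins = 5)
head(adj_pvalues(res))     # covariate-weighted adjusted p-values
rejections(res)            # number of discoveries at alpha = 0.1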
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("tidyverse")
library("IsoformSwitchAnalyzeR")

results <- read_rds(snakemake@input[["rds"]])

results <- analyzePFAM(
    switchAnalyzeRlist = results, 
    pathToPFAMresultFile = snakemake@input[["pfam"]],
    quiet = TRUE
)

results <- analyzeCPAT(
    switchAnalyzeRlist = results,
    pathToCPATresultFile = snakemake@input[["cpat"]],
    codingCutoff = snakemake@params[["coding_cutoff"]],
    removeNoncodinORFs = snakemake@params[["remove_noncoding_orfs"]],
    quiet = TRUE
)

results <- analyzeAlternativeSplicing(
    results,
    quiet = FALSE,
    onlySwitchingGenes = FALSE
)

dir.create(snakemake@output[["plots_with"]])
dir.create(snakemake@output[["plots_without"]])

if(nrow(results$isoformFeatures) > 0) {
    results <- analyzeSwitchConsequences(
        results, 
        consequencesToAnalyze = c(
            'intron_retention',
            'coding_potential',
            # 'ORF_seq_similarity', TODO this is only needed for assembly, reactivate then
            'NMD_status',
            'domains_identified'
        ),
        onlySigIsoforms = FALSE,
        removeNonConseqSwitches = FALSE,
        quiet = TRUE, # set to TRUE to circumvent a bug leading to a stop() if there are no significant switches
        alpha = snakemake@params[["fdr"]],
        dIFcutoff = snakemake@params[["min_effect_size"]]
    )

    switchPlotTopSwitches(
        switchAnalyzeRlist = results,
        n = Inf,
        filterForConsequences = FALSE,
        splitComparison = FALSE,
        splitFunctionalConsequences = TRUE,
        pathToOutput = snakemake@params[["plotdir"]],
        alpha = snakemake@params[["fdr"]],
        dIFcutoff = snakemake@params[["min_effect_size"]],
        onlySigIsoforms = FALSE
    )
}

significant <- extractTopSwitches(
    results,
    filterForConsequences = FALSE,
    extractGenes = TRUE,
    n = Inf,
    sortByQvals = TRUE,
    alpha = snakemake@params[["fdr"]],
    dIFcutoff = snakemake@params[["min_effect_size"]]
)

write_tsv(significant, snakemake@output[["table"]])
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("tidyverse")
library("IsoformSwitchAnalyzeR")

model <- snakemake@params[["model"]]

m <- read_rds(snakemake@input[["designmatrix"]])
is_prefix_col <- startsWith(colnames(m), model[["primary_variable"]])
colnames(m) <- replace(colnames(m), is_prefix_col, "condition")
colnames(m) <- replace(colnames(m), colnames(m) == "sample", "sampleID")
m <- m %>%
      dplyr::select("sampleID", "condition", everything())

quant <- importIsoformExpression(
   sampleVector = set_names(paste(snakemake@input[["kallisto"]], "abundance.tsv", sep="/"), snakemake@params[["samples"]]),
   addIsofomIdAsColumn = TRUE
)

candidates <- importRdata(
    isoformCountMatrix = quant$counts,
    isoformRepExpression = quant$abundance,
    designMatrix = m,
    isoformExonAnnoation = snakemake@input[["gtf"]],
    isoformNtFasta = snakemake@input[["fasta"]],
    showProgress = FALSE
)

# TODO make cutoffs configurable
filtered_candidates <- preFilter(
    switchAnalyzeRlist = candidates,
    geneExpressionCutoff = 1,
    isoformExpressionCutoff = 0,
    removeSingleIsoformGenes = TRUE
)

results <- isoformSwitchTestDEXSeq(
    switchAnalyzeRlist = filtered_candidates,
    reduceToSwitchingGenes = FALSE
)

# get significant genes (have to do this manually, because IsoformSwitchAnalyzeR exits with an error
# in case of no significant genes).
keep <- unique(
    results$isoformFeatures$gene_ref[which(
        results$isoformFeatures$isoform_switch_q_value <
            snakemake@params[["fdr"]] &
            abs(results$isoformFeatures$dIF) > snakemake@params[["min_effect_size"]]
    )]
)

# if(length(keep) == 0) {
#     # No significant genes left, just keep the first to keep the analysis going.
#     keep <- results$isoformFeatures$gene_ref[1]
# }

# filter to significant genes
results <- subsetSwitchAnalyzeRlist(results, results$isoformFeatures$gene_ref %in% keep)

extractSequence(
    results,
    pathToOutput = snakemake@params[["seq_dir"]],
    addToSwitchAnalyzeRlist = TRUE,
    onlySwitchingGenes = FALSE
)

write_rds(results, file = snakemake@output[[1]], compress = "gz")
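Note on the design matrix built above: importRdata() expects one row per sample with mandatory sampleID and condition columns (hence the renaming), plus optional extra covariates. A purely hypothetical example of the expected shape:

m <- data.frame(
  sampleID  = c("s1", "s2", "s3", "s4"),
  condition = c("untreated", "untreated", "treated", "treated")
)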
import sys
import json

import altair as alt
import pandas as pd

sys.stderr = open(snakemake.log[0], "w")

# read the read length from the JSON input file
with open(snakemake.input["read_length"]) as f:
    read_length = json.load(f)

# Transcript IDs: a single transcript ID when plotting per transcript, or "all"
transcript_ids = snakemake.params["each_transcript"]

# Getting the bin files
fwrd_full = []
fwrd_trim = []
fwrd_fil = []

rev_full = []
rev_trim = []
rev_fil = []

# Append all sample bin files into a single dataframe
if transcript_ids != "all":
    fwrd_allsamp_hist_fil = snakemake.input["fwrd_allsamp_hist_fil"]
    rev_allsamp_hist_fil = snakemake.input["rev_allsamp_hist_fil"]
    if fwrd_allsamp_hist_fil:
        for filename in fwrd_allsamp_hist_fil:
            df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
            # NB: "bins_foward" (sic) matches the column name produced upstream
            fwrd_fil.append(
                df.loc[
                    :, df.columns.isin(["sample_Name", "Freq_forward", "bins_foward"])
                ]
            )
        fwrd_allsamp_hist_fil = pd.concat(fwrd_fil, axis=0, ignore_index=True)
    else:
        # no forward-strand inputs: use an empty frame so the .empty checks below work
        fwrd_allsamp_hist_fil = pd.DataFrame()

    if rev_allsamp_hist_fil:
        for filename in rev_allsamp_hist_fil:
            df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
            rev_fil.append(
                df.loc[:, df.columns.isin(["sample_Name", "Freq_rev", "bins_rev"])]
            )
        rev_allsamp_hist_fil = pd.concat(rev_fil, axis=0, ignore_index=True)
    else:
        rev_allsamp_hist_fil = pd.DataFrame()
else:
    file_fwrd_allsamp_hist = snakemake.input["fwrd_allsamp_hist"]
    file_fwrd_allsamp_hist_trim = snakemake.input["fwrd_allsamp_hist_trim"]
    file_rev_allsamp_hist = snakemake.input["rev_allsamp_hist"]
    file_rev_allsamp_hist_trim = snakemake.input["rev_allsamp_hist_trim"]

    for filename in file_fwrd_allsamp_hist:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        fwrd_full.append(df)
    fwrd_allsamp_hist = pd.concat(fwrd_full, axis=0, ignore_index=True)

    for filename in file_fwrd_allsamp_hist_trim:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        fwrd_trim.append(df)
    fwrd_allsamp_hist_trim = pd.concat(fwrd_trim, axis=0, ignore_index=True)

    for filename in file_rev_allsamp_hist:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        rev_full.append(df)
    rev_allsamp_hist = pd.concat(rev_full, axis=0, ignore_index=True)

    for filename in file_rev_allsamp_hist_trim:
        df = pd.read_csv(filename, index_col=None, header=0, sep="\t")
        rev_trim.append(df)
    rev_allsamp_hist_trim = pd.concat(rev_trim, axis=0, ignore_index=True)

# Plot the qc-histogram

if transcript_ids != "all":
    if not fwrd_allsamp_hist_fil.empty:

        hist_fwrd_full = (
            alt.Chart(fwrd_allsamp_hist_fil)
            .mark_line(interpolate="step-after")
            .encode(
                x=alt.X(
                    "bins_foward",
                    title="difference between transcript length and read start",
                ),
                y=alt.Y("Freq_forward:Q", title="Count of Records"),
                color="sample_Name",
            )
            .properties(title="forward strand transcripts (full length)")
        )

        fwrd_allsamp_hist_fil["read_length"] = read_length

        cht_rd_len_fwd_full = (
            alt.Chart(fwrd_allsamp_hist_fil)
            .mark_rule(color="black", strokeDash=[3, 5])
            .encode(x="read_length")
        )

        Fwd_chart_full = hist_fwrd_full + cht_rd_len_fwd_full
        Fwd_chart_full.save(snakemake.output["full_sample_QC"])

    elif not rev_allsamp_hist_fil.empty:
        hist_rev_full = (
            alt.Chart(rev_allsamp_hist_fil)
            .mark_line(interpolate="step-after")
            .encode(
                x=alt.X("bins_rev", title="read start position in the transcript"),
                y=alt.Y("Freq_rev:Q", title="Count of Records"),
                color="sample_Name",
            )
            .properties(title="Reverse strand transcripts (full length)")
        )

        rev_allsamp_hist_fil["read_length"] = read_length

        cht_rd_len_rev = (
            alt.Chart(rev_allsamp_hist_fil)
            .mark_rule(color="black", strokeDash=[3, 5])
            .encode(x="read_length")
        )

        Rev_chart_full = hist_rev_full + cht_rd_len_rev

        Rev_chart_full.save(snakemake.output["full_sample_QC"])

    else:
        # no reads aligned on either strand: emit a placeholder chart
        empty_histogram = alt.Chart().mark_text(text="no reads aligned", size=20)
        empty_histogram.save(snakemake.output["full_sample_QC"])


else:
    # Histogram for forward strand
    # Histogram for full len transcript
    hist_fwrd_full = (
        alt.Chart(fwrd_allsamp_hist)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X(
                "bins_foward",
                title="difference between transcript length and read start",
            ),
            y=alt.Y("Freq_forward:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="forward strand transcripts (full length)")
    )
    fwrd_allsamp_hist["read_length"] = read_length
    cht_rd_len_fwd_full = (
        alt.Chart(fwrd_allsamp_hist)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    Fwd_chart_full = hist_fwrd_full + cht_rd_len_fwd_full

    # Histogram plot for 20000 bp len transcript
    hist_fwrd_trimd = (
        alt.Chart(fwrd_allsamp_hist_trim)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X(
                "bins_foward",
                title="difference between transcript length and read start",
            ),
            y=alt.Y("Freq_forward:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="forward strand transcripts (showing 1-20000bp)")
    )
    fwrd_allsamp_hist_trim["read_length"] = read_length
    cht_rd_len_fwd_trim = (
        alt.Chart(fwrd_allsamp_hist_trim)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    Fwd_chart_trim = hist_fwrd_trimd + cht_rd_len_fwd_trim

    Fwd_chart = alt.hconcat(Fwd_chart_trim, Fwd_chart_full)

    # Histogram for reverse strand
    # Histogram for full len transcript
    hist_rev_full = (
        alt.Chart(rev_allsamp_hist)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X("bins_rev", title="read start position in the transcript"),
            y=alt.Y("Freq_rev:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="Reverse strand transcripts (full length)")
    )
    rev_allsamp_hist["read_length"] = read_length
    cht_rd_len_rev = (
        alt.Chart(rev_allsamp_hist)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    rev_chart_full = hist_rev_full + cht_rd_len_rev

    # Histogram plot for 20000 bp len transcript
    hist_rev_trim = (
        alt.Chart(rev_allsamp_hist_trim)
        .mark_line(interpolate="step-after")
        .encode(
            x=alt.X("bins_rev", title="read start position in the transcript"),
            y=alt.Y("Freq_rev:Q", title="Count of Records"),
            color="sample_Name",
        )
        .properties(title="Reverse strand transcripts (showing 1-20000bp)")
    )
    rev_allsamp_hist_trim["read_length"] = read_length
    cht_rd_len_rev_trim = (
        alt.Chart(rev_allsamp_hist_trim)
        .mark_rule(color="black", strokeDash=[3, 5])
        .encode(x="read_length")
    )
    rev_chart_trim = hist_rev_trim + cht_rd_len_rev_trim

    Rev_chart = alt.hconcat(rev_chart_trim, rev_chart_full)

    Final_chart = alt.vconcat(Fwd_chart, Rev_chart)
    Final_chart.save(snakemake.output["full_sample_QC"])
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("tidyverse")
library("sleuth")

so <- sleuth_load(snakemake@input[["so"]])

# a negative n makes dplyr::top_n() select the n smallest q-values
top_n <- -strtoi(snakemake@params[["top_n"]])

results <- read_tsv(snakemake@input[["transcripts"]])

top_genes <- results %>%
   filter(qval <= snakemake@params[["fdr"]]) %>%
   top_n(top_n, qval) %>%
   dplyr::select(ext_gene) %>%
   drop_na() %>%
   distinct(ext_gene)


if (snakemake@params[["genes"]]$activate == TRUE) {
    gene_table <- read.csv(snakemake@params[["genes"]]$genelist,
        sep = "\t")
    names(gene_table) <- c("ext_gene")
    genes_of_interest <- tibble(gene_table) %>%
        distinct(ext_gene)
} else {
    # "genes" is null, if the list provided in config.yaml is empty
    genes_of_interest <- tibble(ext_gene = character())
}
genes <- full_join(top_genes, genes_of_interest, by = "ext_gene") %>%
    add_row(ext_gene = "Custom") %>%
    pull(ext_gene)

dir.create(snakemake@output[[1]])

for (gene in genes) {
    transcripts <- results %>%
        filter(ext_gene == gene) %>%
        drop_na() %>%
        pull(target_id)

    if (length(transcripts) > 0) {
        for (transcript in transcripts) {
            p <- plot_bootstrap(so, transcript, color_by = snakemake@params[["color_by"]], units = "est_counts")
            ggsave(file = str_c(snakemake@output[[1]], "/", gene, ".", transcript, ".", snakemake@wildcards[["model"]], ".bootstrap.pdf"), plot = p)
        }
    }
}
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")


library(pheatmap)
library(dplyr)
library(tidyr)
# Reading the sleuth log count matrix file
sleuth_file <- read.csv(snakemake@input[["logcountmatrix_file"]],
    sep = "\t", header = TRUE
)
# Check the config file if it contains pre-defined gene list

if (snakemake@wildcards[["mode"]] == "topn") {
    # Replacing the rownames with transcript Id and gene name
    rownames(sleuth_file) <- paste(
        sleuth_file$transcript,
        ":", sleuth_file$gene
    )
    sleuth_file$gene <- NULL
    sleuth_file$transcript <- NULL
    # Getting the per-transcript variance across samples
    vargenes <- apply(sleuth_file, 1, var)
    # Selecting the 50 most variable transcripts
    selectedgenes <- sleuth_file[
        row.names(sleuth_file) %in%
            names(vargenes[order(vargenes, decreasing = TRUE)][1:50]),
    ]
    # If the config file contains a pre-defined gene list
} else if (snakemake@wildcards[["mode"]] == "predefined") {
    # Reading the pre-defined gene list
    predefine_genelist <- read.table(snakemake@input[["predef_genelist"]],
        sep = "\t"
    )
    selectedgenes <- sleuth_file %>%
        filter(gene %in% predefine_genelist$V1)
    rownames(selectedgenes) <- paste(
        selectedgenes$transcript,
        ":", selectedgenes$gene
    )
    selectedgenes$gene <- NULL
    selectedgenes$transcript <- NULL
}
# Check whether the selected genes contain any non-zero expression values
if (all(selectedgenes == 0)) {
    txt <- "cannot plot, all values are zero"
    pdf(snakemake@output[["diffexp_heatmap"]], height = 10, width = 10)
    plot.new()
    text(.5, .5, txt, font = 2, cex = 1.5)
    dev.off()
    # If there are more than 100 transcripts, scale the PDF height accordingly
    # (in RNA-seq, multiple transcripts may be present for a single gene).
} else if (nrow(selectedgenes) > 100) {
    pdf_height <- round(nrow(selectedgenes) / 5)
    pdf(snakemake@output[["diffexp_heatmap"]],
        height = pdf_height, width = ncol(selectedgenes) * 2
    )
    pheatmap(selectedgenes,
        cluster_rows = FALSE, display_numbers = TRUE,
        cellheight = 10, scale = "row"
    )
    dev.off()
} else {
    pdf_height <- round(nrow(selectedgenes) / 5)
    pdf(
        file = snakemake@output[["diffexp_heatmap"]],
        height = pdf_height, width = ncol(selectedgenes) * 2
    )
    pheatmap(selectedgenes, scale = "row")
    dev.off()
}
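The topn branch above reduces to ranking rows by variance and keeping the top 50; a minimal sketch of that selection on random data (sizes here are arbitrary):

m <- matrix(rnorm(200), nrow = 20)
row_vars <- apply(m, 1, var)
top_rows <- m[order(row_vars, decreasing = TRUE)[1:5], ]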
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("ggplot2")
library("tidyr")

diffexp <- sleuth_load(snakemake@input[["diffexp_rds"]]) %>% drop_na(pval)

ggplot(diffexp) + geom_histogram(aes(pval), bins = 100)
ggsave(file = snakemake@output[[1]], width = 14)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("fgsea")

# provides library("tidyverse") and function get_prefix_col()
# the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])


gene_sets <- gmtPathways(snakemake@input[["gene_sets"]])
sig_gene_sets <- read_tsv(snakemake@input[["sig_gene_sets"]])
diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
                  drop_na(ext_gene) %>%
                  mutate(ext_gene = str_to_upper(ext_gene)) %>%
                  group_by(ext_gene) %>%
                  filter( qval == min(qval, na.rm = TRUE) ) %>%
                  mutate(target_id = str_c(target_id, collapse=",")) %>%
                  mutate(ens_gene = str_c(ens_gene, collapse=",")) %>%
                  distinct()

signed_pi <- get_prefix_col("signed_pi_value", colnames(diffexp))

ranked_genes <- diffexp %>%
                  dplyr::select(ext_gene, !!signed_pi) %>%
                  deframe()

dir.create( snakemake@output[[1]] )

for ( set in (sig_gene_sets %>% pull(pathway)) ) {
  # plot gene set enrichment, skipping sets with no genes in the ranked list
  if (length(gene_sets[[set]]) == 0 || all(is.na(ranked_genes[as.vector(gene_sets[[set]])]))) {
    next
  }
  p <- plotEnrichment(gene_sets[[set]], ranked_genes) +
         ggtitle(str_c("gene set: ", set),
           subtitle = str_c(
             sig_gene_sets %>% filter(pathway == set) %>% pull(size)," genes;  ",
             "p-value (BH-adjusted): ", sig_gene_sets %>% filter(pathway ==  set) %>% pull(padj), "\n",
             "normalized enrichment score (NES):", sig_gene_sets %>% filter(pathway == set) %>% pull(NES)
             )
         ) +
         xlab("gene rank") +
         theme_bw( base_size = 16 )
  setname <- gsub("[-%/:,'\\. ]", "_", set)
  fname <- str_c(snakemake@wildcards[["model"]], ".", setname, ".gene-set-plot.pdf")
  ggsave(
    file = file.path( snakemake@output[[1]], fname ),
    plot = p,
    width = 10,
    height = 7
  )
}
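The sig_gene_sets table consumed above is produced by an fgsea run over the same ranked gene vector; a hedged sketch of how such a table typically arises (pathway, padj, NES and size are standard fgsea output columns; the 0.05 cutoff is illustrative):

fgsea_res <- fgsea(pathways = gene_sets, stats = ranked_genes)
sig_gene_sets <- fgsea_res %>%
  as_tibble() %>%
  filter(padj <= 0.05)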
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")

so <- sleuth_load(snakemake@input[[1]])

pdf(file = snakemake@output[[1]])
plot_fld(so, paste0(snakemake@wildcards[["sample"]], "-", snakemake@wildcards[["unit"]]))
dev.off()
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")


so <- sleuth_load(snakemake@input[[1]])

plot_group_density(so,
                   use_filtered = TRUE,
                   units = "est_counts",
                   trans = "log",
                   grouping = setdiff(colnames(so$sample_to_covariates), "sample"),
                   offset = 1)
ggsave(snakemake@output[[1]])
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("ggpubr")

#principal components
pc <- 4

# plot pca
so <- sleuth_load(snakemake@input[[1]])
pc12 <- plot_pca(so, color_by = snakemake@wildcards[["covariate"]], units = "est_counts", text_labels = TRUE)
pc34 <- plot_pca(so, pc_x = 3, pc_y = 4, color_by = snakemake@wildcards[["covariate"]], units = "est_counts", text_labels = TRUE)

ggarrange(pc12, pc34, ncol = 2, common.legend = TRUE)
ggsave(snakemake@output[["pca"]], width=14)

# plot pc variance
plot_pc_variance(so, use_filtered = TRUE, units = "est_counts", pca_number = pc)
ggsave(snakemake@output[["pc_var"]], width=14)

# plot loadings
pc_loading_plots <- list()
for(i in 1:pc) {
  pc_loading_plots[[paste0("pc", i, "_loading")]] <- plot_loadings(so, 
                                                                   scale = TRUE, 
                                                                   pc_input = i,
                                                                   pc_count = 10, 
                                                                   units = "est_counts") +
    ggtitle(paste0(snakemake@wildcards[["covariate"]], ": plot loadings for principal component ", i)) +
    theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
}
plots_loading <- ggarrange(plotlist = pc_loading_plots, ncol = 1, nrow = 1, common.legend = TRUE)
ggexport(plots_loading, filename = snakemake@output[["loadings"]], width = 14)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")
library("ggpubr")

so <- sleuth_load(snakemake@input[[1]])
plot_list <- list()

# combinations between samples without repetition 

# sample_matrix <- t(combn(so$sample_to_covariates$sample, 2))
# combinations <- as.numeric(row(sample_matrix))
# 
# for(i in combinations) {
#   sample1 <- sample_matrix[i, 1]
#   sample2 <- sample_matrix[i, 2]
#   
#   plot_list[[i]] <- plot_scatter(so, 
#                                  sample_x = sample1, 
#                                  sample_y = sample2, 
#                                  use_filtered = TRUE, 
#                                  units = "est_counts", 
#                                  point_alpha = 0.2, 
#                                  xy_line = TRUE, 
#                                  xy_line_color = "red") +
#     labs(x = sample1, y = sample2)
#   }


# all combinations between samples (with repetition)

samples <- so$sample_to_covariates$sample
sample_matrix <- crossing(x = samples, y = samples) # creates all combinations between samples
combinations <- seq_len(nrow(sample_matrix))

for(i in combinations) {
  sample1 <- sample_matrix$x[i]
  sample2 <- sample_matrix$y[i]
  if(sample1 != sample2) {
    plot_list[[i]] <- plot_scatter(so,
                                   sample_x = sample1,
                                   sample_y = sample2,
                                   use_filtered = TRUE,
                                   units = "est_counts",
                                   point_alpha = 0.2,
                                   xy_line = TRUE,
                                   xy_line_color = "red") +
      labs(x = sample1, y = sample2)

  } else {
    x_val <- y_val <- c(0,0)
    test <- tibble(x_val, y_val)
    plot_list[[i]] <- ggplot(test, aes(x = x_val, y = y_val)) +
      theme_void() +
      geom_text(label = "") +
      annotate("text", label = sample1, x=0, y=0, colour = "blue", fontface = "bold")
  }
}

all_plots <- ggarrange(plotlist = plot_list)
plot_figure <- annotate_figure(all_plots, 
                               top = text_grob("log transformed scatter plots of different samples",
                                               color = "black", 
                                               face = "bold", 
                                               size = 14))
ggsave(snakemake@output[[1]], plot_figure)
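tidyr::crossing(), used above, enumerates every ordered sample pair including self-pairs, which the loop turns into the diagonal labels of the scatter matrix:

library("tidyverse")
crossing(x = c("A", "B"), y = c("A", "B"))
# 4 rows: A/A, A/B, B/A, B/B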
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")

sl <- snakemake@params[['sig_level']]

so <- sleuth_load(snakemake@input[[1]])

# colors from colorblind-safe Brewer palette "Dark2":
# http://colorbrewer2.org/#type=qualitative&scheme=Dark2&n=3
cb_safe_green <- "#1B9E77"
cb_safe_red <- "#D95F02"

p <- plot_vars(so,
        test = NULL,
        test_type = "lrt",
        which_model = "full",
        sig_level = sl,
        point_alpha = 0.2,
        sig_color = cb_safe_red,
        xy_line = TRUE,
        xy_line_color = cb_safe_red,
        highlight = NULL,
        highlight_color = cb_safe_green
    )
ggsave(filename = snakemake@output[[1]], plot = p, width = 7, height = 7)
import re
import sys

from Bio import SeqIO

sys.stderr = open(snakemake.log[0], "w")

with open(snakemake.output[0], "w") as transcript_clean_cdna_fasta:
    for seq_record in SeqIO.parse(snakemake.input["ref_fasta"], "fasta"):
        transcript_location = seq_record.description.split(" ")[2]
        strand = transcript_location.split(":")[5]
        if strand == "1":
            polyrem_seq = re.sub("TTTT+$|AAAA+$", "", str(seq_record.seq))
            print(
                ">",
                seq_record.id,
                "_1 ",
                seq_record.description,
                sep="",
                file=transcript_clean_cdna_fasta,
            )
            print(polyrem_seq, file=transcript_clean_cdna_fasta)
        elif strand == "-1":
            polyrem_seq = re.sub("^TTTT+|^AAAA+", "", str(seq_record.seq))
            print(
                ">",
                seq_record.id,
                "_-1 ",
                seq_record.description,
                sep="",
                file=transcript_clean_cdna_fasta,
            )
            print(polyrem_seq, file=transcript_clean_cdna_fasta)
import sys

from Bio import SeqIO

sys.stderr = open(snakemake.log[0], "w")

with open(snakemake.output[0], "w") as three_prime_output_file:
    for seq_record in SeqIO.parse(snakemake.input["ref_fasta"], "fasta"):
        transcript_location = seq_record.description.split(" ")[0]
        # Split the transcript id by `_` to filter the strand info from transcript id
        transcript_id = transcript_location.split("_")[0]
        print(">", transcript_id, sep="", file=three_prime_output_file)
        print(seq_record.seq, file=three_prime_output_file)
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")
library("fs")
library("grid")
library("gridExtra")
library("dplyr")
model <- snakemake@params[["model"]]

sleuth_object <- sleuth_load(snakemake@input[[1]])

sleuth_object <- sleuth_lrt(sleuth_object, "reduced", "full")

# plot mean-variance
mean_var_plot <- plot_mean_var(sleuth_object, 
                               which_model = "full", 
                               point_alpha = 0.4,
                               point_size = 2, 
                               point_colors = c("black", "dodgerblue"),
                               smooth_alpha = 1, 
                               smooth_size = 0.75, 
                               smooth_color = "red")
ggsave(snakemake@output[["mean_var_plot"]], mean_var_plot)

write_results <- function(so, mode, output, output_all) {
    so$pval_aggregate <- FALSE
    if(mode == "aggregate") {
      # work around the bug described in the following pull request:
      # https://github.com/pachterlab/sleuth/pull/240
      # TODO the renaming can be removed once this is fixed
      g_col <- so$gene_column
      so$gene_column <- NULL
      so$pval_aggregate <- TRUE
      so$gene_column <- g_col
    }

    plot_model <- snakemake@wildcards[["model"]]

    # list for qq-plots to make a multipage pdf-file as output
    qq_list <- list()

    print("Performing likelihood ratio test")
    all <- sleuth_results(so, "reduced:full", "lrt", show_all = TRUE, pval_aggregate = so$pval_aggregate) %>%
            arrange(pval)
    all <- all %>% mutate_if(is.numeric, signif, 3)
    covariates <- colnames(design_matrix(so, "full"))
    covariates <- covariates[covariates != "(Intercept)"]

    # iterate over all covariates and perform a Wald test in order to obtain beta estimates
    if(!so$pval_aggregate) {

      # lists for volcano and ma-plots to make a multipage pdf-file as output
      volcano_list <- list()
      ma_list <- list()

      for(covariate in covariates) {
            print(str_c("Performing wald test for ", covariate))
            so <- sleuth_wt(so, covariate, "full")

            volc_plot_title <- str_c(plot_model, ": volcano plot for ", covariate)
            ma_plot_title <- str_c(plot_model, ": ma-plot for ", covariate)
            qq_plot_title <- str_c(plot_model, ": qq-plot from wald test for ", covariate)

            # volcano plot
            print(str_c("Performing volcano plot for ", covariate))
            volcano <- plot_volcano(so, covariate, "wt", "full",
                                    sig_level = snakemake@params[["sig_level_volcano"]], 
                                    point_alpha = 0.2, 
                                    sig_color = "red",
                                    highlight = NULL) +
              ggtitle(volc_plot_title) +
              theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
            volcano_list[[volc_plot_title]] <- volcano

            # ma-plot
            print(str_c("Performing ma-plot for ", covariate))
            ma_plot <- plot_ma(so, covariate, "wt", "full",
                               sig_level = snakemake@params[["sig_level_ma"]], 
                               point_alpha = 0.2, 
                               sig_color = "red",
                               highlight = NULL, 
                               highlight_color = "green") +
              ggtitle(ma_plot_title) +
              theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
            ma_list[[ma_plot_title]] <- ma_plot

            # qq-plots from wald test
            print(str_c("Performing qq-plot from wald test for ", covariate))
            qq_plot <- plot_qq(so, covariate, "wt", "full", 
                               sig_level = snakemake@params[["sig_level_qq"]],
                               point_alpha = 0.2, 
                               sig_color = "red", 
                               highlight = NULL, 
                               highlight_color = "green",
                               line_color = "blue") +
              ggtitle(qq_plot_title) +
              theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
            qq_list[[qq_plot_title]] <- qq_plot


            beta_col_name <- str_c("b", covariate, sep = "_")
            beta_se_col_name <- str_c(beta_col_name, "se", sep = "_")
            qval_name <- str_c("qval", covariate, sep = "_")
            all_wald <- sleuth_results(so, covariate, "wt", show_all = TRUE, pval_aggregate = FALSE) %>%
                        dplyr::select( target_id = target_id,
                                !!qval_name := qval,
                                !!beta_col_name := b,
                                !!beta_se_col_name := se_b)
            signed_pi_col_name <- str_c("signed_pi_value", covariate, sep = "_")
            all <- inner_join(all, all_wald, by = "target_id") %>%
                    # calculate a signed version of the pi-value from:
                    # https://dx.doi.org/10.1093/bioinformatics/btr671
                    # e.g. useful for GSEA ranking
                    mutate( !!signed_pi_col_name := -log10(pval) * !!sym(beta_col_name) )
      }
      # saving volcano plots
      marrange_volcano <- marrangeGrob(grobs=volcano_list, nrow=1, ncol=1, top = NULL)
      ggsave(snakemake@output[["volcano_plots"]], plot = marrange_volcano, width = 14)

      # saving ma-plots
      marrange_ma <- marrangeGrob(grobs=ma_list, nrow=1, ncol=1, top = NULL)
      ggsave(snakemake@output[["ma_plots"]], plot = marrange_ma, width = 14)
    }

    if(mode == "mostsignificant") {
        # for each gene, select the most significant transcript (this is equivalent to sleuth_gene_table, but with bug fixes)
      all <- all %>%
                drop_na(ens_gene) %>%
                group_by(ens_gene) %>%
                filter( qval == min(qval, na.rm = TRUE) ) %>%
                # ties in qval (e.g. min(qval) == 1) can lead to multiple entries per gene
                filter( pval == min(pval, na.rm = TRUE) ) %>%
                # for min(qval) == 1, then usually also min(pval) == 1, and
                # we need something else to select a unique entry per gene
                filter( mean_obs == max(mean_obs, na.rm = TRUE) ) %>%
                # sometimes we still get two transcript variants with exactly
                # the same values, i.e. they have exactly the same sequence
                # but (slightly) different annotations -- then we retain a string
                # with a comma-separated list of them
                mutate(target_id = str_c(target_id, collapse=",")) %>%
                distinct() %>%
                # useful sort for scrolling through output by increasing q-values
                arrange(qval)
      all <- all %>% mutate_if(is.numeric, signif, 3)
      # qq-plot from likelihood ratio test
      print(str_c("Performing qq-plot from likelihood ratio test"))
      qq_plot_title_trans <- str_c(plot_model, ": qq-plot from likelihood ratio test")
      qq_plot_trans <- plot_qq(so, 
                                test = 'reduced:full', 
                                test_type = 'lrt', 
                                sig_level = snakemake@params[["sig_level_qq"]],
                                point_alpha = 0.2, 
                                sig_color = "red", 
                                highlight = NULL, 
                                highlight_color = "green",
                                line_color = "blue") +
        ggtitle(qq_plot_title_trans) +
        theme(plot.title = element_text(size = 16, face = "bold", hjust = 0.5))
      qq_list[[qq_plot_title_trans]] <- qq_plot_trans
    } else if (mode == "canonical") {
      all <- all %>%
                drop_na(canonical) %>%
                filter(canonical)
      if (nrow(all) == 0) {
        stop("No canonical transcripts found (does ensembl support canonical transcript annotation for your species?")
      }
      # Control FDR again, because we have less tests now.
      all$qval <- p.adjust(all$pval, method = "BH")
      all <- all %>% mutate_if(is.numeric, signif, 3)
    } else if (mode == "custom") {
      # load custom ID list
      id_version_pattern <- "\\.\\d+$"
      ids <- read_tsv(snakemake@params[["representative_transcripts"]], col_names = "ID")$ID
      ids <- str_replace(ids, id_version_pattern, "")
      all <- all %>% 
        mutate(target_id_stem = str_replace(target_id, id_version_pattern, "")) %>% 
        filter(target_id_stem %in% ids) %>% 
        mutate(target_id_stem = NULL)
      all <- all %>% mutate_if(is.numeric, signif, 3)
      if (nrow(all) == 0) {
        stop("The given list of representative transcript ids does not match any of the transcript ids of the chosen species.")
      }
    }

    # saving qq-plots
    marrange_qq <- marrangeGrob(grobs=qq_list, nrow=1, ncol=1, top = NULL)
    ggsave(snakemake@output[["qq_plots"]], plot = marrange_qq, width = 14)

    write_rds(all, file = output_all, compress = "none")
    # add sample expressions
    all <- all %>% left_join(as_tibble(sleuth_to_matrix(so, "obs_norm", "est_counts"), rownames = "target_id"), by = "target_id")
    all <- all %>% mutate_if(is.numeric, signif, 3)
    write_tsv(all, file = output, escape = "none")
}

write_results(sleuth_object, "transcripts", snakemake@output[["transcripts"]], snakemake@output[["transcripts_rds"]])
write_results(sleuth_object, "aggregate", snakemake@output[["genes_aggregated"]], snakemake@output[["genes_aggregated_rds"]])

repr_trans <- snakemake@params[["representative_transcripts"]]
if (repr_trans == "canonical") {
  write_results(sleuth_object, "canonical", snakemake@output[["genes_representative"]], snakemake@output[["genes_representative_rds"]])
} else if (repr_trans == "mostsignificant") {
  write_results(sleuth_object, "mostsignificant", snakemake@output[["genes_representative"]], snakemake@output[["genes_representative_rds"]])
} else {
  write_results(sleuth_object, "custom", snakemake@output[["genes_representative"]], snakemake@output[["genes_representative_rds"]])
}
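For reference, the per-covariate signed pi-value computed above (see the DOI linked in the code) combines significance and effect size into a single ranking statistic:

$$\pi_{\text{signed}} = -\log_{10}(p) \cdot \beta$$

where $p$ is the likelihood ratio test p-value and $\beta$ the Wald-test beta estimate for the covariate, so genes that are both significant and strongly up- or down-regulated end up at the extremes of a GSEA ranking.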
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("tidyverse")

model <- snakemake@params[["model"]]

samples <- read_tsv(snakemake@input[["samples"]], na = "", col_names = TRUE) %>%
            # make everything except the sample name and the path a factor
            mutate_at(  vars(-sample, -path),
                        list(~factor(.))
                        )

if(!is.null(snakemake@params[["exclude"]])) {
    samples <- samples %>%
                filter( !sample %in% snakemake@params[["exclude"]] )
}

samples_out <- if(!is.null(model[["full"]])) {
    # retrieve the model formula
    formula <- as.formula(model[["full"]])
    # extract variables from the formula and unnest any nested variables
    variables <- labels(terms(formula)) %>%
                    strsplit('[:*]') %>%
                    unlist()
    # remove samples with an NA value in any of the columns
    # relevant for sleuth under the current model
    samples <- samples %>%
	        drop_na(c(sample, path, all_of(variables)))

    primary_variable <- model[["primary_variable"]]
    base_level <- model[["base_level"]]
    # TODO migrate this to tidyverse
    # Ensure that primary variable factors are sorted such that base_level comes first.
    # This is important for fold changes, effect sizes to have the expected sign.
    samples[, primary_variable] <- relevel(samples[, primary_variable, drop=TRUE], base_level)

    samples %>% select(c(sample, all_of(variables)))
} else {
    samples %>% select(-path)
}

# store design matrix
write_rds(samples_out, file = snakemake@output[["designmatrix"]])

# remove all columns which have only NA values
samples <- samples %>%
	    select_if(function(col) !all(is.na(col)))

t2g <- read_rds(snakemake@input[["transcript_info"]])

so <- sleuth_prep(  samples,
                    extra_bootstrap_summary = TRUE,
                    target_mapping = t2g,
                    aggregation_column = "ens_gene",
                    read_bootstrap_tpm = TRUE,
                    transform_fun_counts = function(x) log2(x + 0.5),
                    num_cores = snakemake@threads
                    )

if ("canonical" %in% colnames(so$target_mapping)) {
    # Sleuth converts all columns to characters. We don't want that for the canonical column.
    # Hence we fix it here.
    so$target_mapping$canonical <- as.logical(so$target_mapping$canonical)
}
custom_transcripts <- so$obs_raw %>%
                        # find transcripts not in the target_mapping
                        filter(!target_id %in% so$target_mapping$target_id) %>%
                        # make it a succinct list without repetitions
                        distinct(target_id) %>%
                        # pull it out into a vector
                        pull(target_id)

if(length(custom_transcripts) > 0) {
    so$target_mapping <- so$target_mapping %>%
                        # add those custom transcripts as rows to the target mapping
                        add_row(ens_gene = NA, ext_gene = "Custom", target_id = custom_transcripts, canonical = NA)
}

if(!is.null(model[["full"]])) {
    so <- sleuth_fit(so, as.formula(model[["full"]]), 'full')
    so <- sleuth_fit(so, as.formula(model[["reduced"]]), 'reduced')
}

sleuth_save(so, snakemake@output[["sleuth_object"]])
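The relevel() step above is what fixes the sign convention of the downstream beta estimates: the first factor level becomes the reference of the model matrix. A small illustration:

f <- factor(c("treated", "untreated", "treated"))
levels(f)                    # "treated" "untreated" (alphabetical default)
f <- relevel(f, "untreated")
levels(f)                    # "untreated" now comes first, i.e. the base level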
log <- file(snakemake@log[[1]], open="wt")
sink(log)
sink(log, type="message")

library("sleuth")
library("limma")
library("tidyverse")

so <- sleuth_load(snakemake@input[[1]])

# get normed counts
norm_counts <- sleuth_to_matrix(so, "obs_norm", "est_counts")

# log transform
log_norm_counts <- log2(norm_counts + 1)

# reorder columns to match covariates
log_norm_counts <- log_norm_counts[, so$sample_to_covariates$sample, drop=FALSE]

# obtain covariates
model <- snakemake@params[["model"]]
model_nobatch <- paste("~", model[["primary_variable"]], sep="")

covariates <- model.matrix(as.formula(model[["reduced"]]), data = so$sample_to_covariates)
design <- model.matrix(as.formula(model_nobatch), data = so$sample_to_covariates)
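
# Per the limma documentation, removeBatchEffect() regresses out the
# effects given via `covariates` (here the reduced model) while protecting
# the effects in `design` (here the primary variable), so that the signal
# of interest is preserved.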

stopifnot(so$sample_to_covariates$sample == colnames(log_norm_counts))

# remove batch effects
final_counts <- removeBatchEffect(log_norm_counts, covariates = covariates, design = design)

target_mapping <- so$target_mapping
rownames(target_mapping) <- target_mapping$target_id

final_counts <- rownames_to_column(as.data.frame(final_counts), var="transcript") %>% 
                add_column(
                    gene = target_mapping[rownames(final_counts), "ext_gene"],
                    .after = "transcript"
                )


write_tsv(final_counts, snakemake@output[[1]])
log <- file(snakemake@log[[1]], open = "wt")
sink(log)
sink(log, type = "message")

library("SPIA")
library("graphite")
library(snakemake@params[["bioc_species_pkg"]], character.only = TRUE)

# provides library("tidyverse") and functions load_bioconductor_package() and
# get_prefix_col(), the latter requires snakemake@output[["samples"]] and
# snakemake@params[["covariate"]]
source(snakemake@params[["common_src"]])

pw_db <- snakemake@params[["pathway_db"]]
db <- readRDS(snakemake@input[["spia_db"]])

options(Ncpus = snakemake@threads)

diffexp <- read_tsv(snakemake@input[["diffexp"]]) %>%
    drop_na(ens_gene) %>%
    mutate(ens_gene = str_c("ENSEMBL:", ens_gene))
universe <- diffexp %>% pull(var = ens_gene)
sig_genes <- diffexp %>% filter(qval <= 0.05)
if (nrow(sig_genes) == 0) {
    cols <- c(
        "Name", "Status", "Combined FDR",
        "total perturbation accumulation", "number of genes on the pathway",
        "number of DE genes per pathway", "p-value for at least NDE genes",
        "Combined Bonferroni p-values",
        "p-value to observe a total accumulation", "Combined p-value", "Ids"
    )
    res <- data.frame(matrix(ncol = 11, nrow = 0, dimnames = list(NULL, cols)))
    # create an empty perturbation plots PDF alongside the empty tables
    pdf(file = snakemake@output[["plots"]])
    dev.off()
    write_tsv(res, snakemake@output[["table"]])
    write_tsv(res, snakemake@output[["table_activated"]])
    write_tsv(res, snakemake@output[["table_inhibited"]])
} else {
    # get logFC equivalent (the sum of beta scores of covariates of interest)

    beta_col <- get_prefix_col("b", colnames(sig_genes))

    beta <- sig_genes %>%
        dplyr::select(ens_gene, !!beta_col) %>%
        deframe()

    t <- tempdir(check = TRUE)
    olddir <- getwd()
    setwd(t)
    prepareSPIA(db, pw_db)
    res <- runSPIA(
        de = beta, all = universe, pw_db,
        plots = TRUE, verbose = TRUE
    )
    setwd(olddir)

    file.copy(
        file.path(t, "SPIAPerturbationPlots.pdf"),
        snakemake@output[["plots"]]
    )
    pathway_names <- db[res$Name]
    path_ids <- as.matrix(lapply(pathway_names@entries, slot, "id"))
    if (length(path_ids) > 0) {
        path_ids_data_frame <-
            data.frame(Ids = matrix(unlist(path_ids),
                nrow = length(path_ids), byrow = TRUE
            ))
        final_res <- cbind(res,
            Ids = path_ids_data_frame$Ids
        )
        res_reorder <- dplyr::select(
            final_res, Name, Status,
            pGFdr, tA, pSize, NDE, pNDE, pGFWER, pPERT, pG, Ids
        )
        res_reorder <- res_reorder %>%
            rename(
                "Combined Bonferroni p-values" = "pGFWER",
                "Combined FDR" = "pGFdr",
                "total perturbation accumulation" = "tA",
                "number of genes on the pathway" = "pSize",
                "number of DE genes per pathway" = "NDE",
                "Combined p-value" = "pG",
                "p-value to observe a total accumulation" = "pPERT",
                "p-value for at least NDE genes" = "pNDE"
            )
        write_tsv(res_reorder, snakemake@output[["table"]])
        sort_activated <- res_reorder[res_reorder$Status == "Activated", ]
        sort_inhibited <- res_reorder[res_reorder$Status == "Inhibited", ]
        write_tsv(sort_activated, snakemake@output[["table_activated"]])
        write_tsv(sort_inhibited, snakemake@output[["table_inhibited"]])
    } else {
        columns <- c(
            "Name", "Status", "Combined FDR", "total perturbation accumulation",
            "number of genes on the pathway", "number of DE genes per pathway",
            "p-value for at least NDE genes", "Combined Bonferroni p-values",
            "p-value to observe a total accumulation", "Combined p-value", "Ids"
        )
        empty_data_frame <- data.frame(matrix(nrow = 0, ncol = length(columns)))
        colnames(empty_data_frame) <- columns
        write_tsv(empty_data_frame, snakemake@output[["table"]])
        write_tsv(empty_data_frame, snakemake@output[["table_activated"]])
        write_tsv(empty_data_frame, snakemake@output[["table_inhibited"]])
    }
}
from string import Template
import pandas as pd
from io import StringIO


HTML = r"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title></title>
</head>
<body>
    <div id="vis" style="width: 100%; height: 90vh;"></div>
    <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
    <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
    <script>
        var vega_specs = $json;
        vegaEmbed('#vis', vega_specs);
    </script>
</body>
</html>
"""


def main(snakemake):

    # read vega js file with template vars
    # `$data`, `$sig_level`, `$beta_column` and `$beta_se_column`
    with open(snakemake.input.spec, "rt") as f:
        spec = Template(f.read())

    sig_level = snakemake.params.sig_level_volcano
    primary_var = snakemake.params.primary_variable

    # find column that matches primary variable
    df: pd.DataFrame = pd.read_csv(snakemake.input.tsv, sep="\t")

    primary_cols = [
        c
        for c in list(df.columns)
        if c.startswith(f"b_{primary_var}") and not c.endswith("_se")
    ]
    if len(primary_cols) > 1:
        print(f"WARNING: found {len(primary_cols)} possible primary variables")
    beta_col = primary_cols[0]

    # only keep columns needed for plot
    df = df[["ens_gene", "ext_gene", "target_id", "qval", beta_col, beta_col + "_se"]]

    # nan / NA / None values do not get plotted, so remove respective rows
    df = df.dropna()

    data = StringIO()
    df.to_csv(data, sep="\t", index=False)

    # update the spec with concrete values
    json = spec.safe_substitute(
        data=data.getvalue().replace("\t", r"\t").replace("\n", r"\n"),
        sig_level=sig_level,
        beta_column=beta_col,
        beta_se_column=beta_col + "_se",
    )

    with open(snakemake.output.json, "wt") as f:
        f.write(json)



if __name__ == "__main__":
    main(snakemake)