A Snakemake workflow for calling small and structural variants under any kind of scenario (tumor/normal, tumor/normal/relapse, germline, pedigree, populations) via the unified statistical model of Varlociraptor.

Public. Released 1 year ago. Version: v5.0.1. 0 bookmarks.

A Snakemake workflow for calling small and structural variants under any kind of scenario (tumor/normal, tumor/normal/relapse, germline, pedigree, populations) via the unified statistical model of Varlociraptor.

Usage

The usage of this workflow is described in the Snakemake Workflow Catalog.

If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and its DOI (see above).

Code Snippets

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
__author__ = "Christopher Schröder"
__copyright__ = "Copyright 2020, Christopher Schröder"
__email__ = "[email protected]"
__license__ = "MIT"

import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

extra = snakemake.params.get("extra", "")
spark_runner = snakemake.params.get("spark_runner", "LOCAL")
spark_master = snakemake.params.get(
    "spark_master", "local[{}]".format(snakemake.threads)
)
spark_extra = snakemake.params.get("spark_extra", "")
java_opts = get_java_opts(snakemake)

log = snakemake.log_fmt_shell(stdout=True, stderr=True)
known = snakemake.input.get("known", "")
if known:
    known = "--known-sites {}".format(known)

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "gatk --java-options '{java_opts}' BaseRecalibratorSpark"
        " --input {snakemake.input.bam}"
        " --reference {snakemake.input.ref}"
        " {extra}"
        " --tmp-dir {tmpdir}"
        " --output {snakemake.output.recal_table} {known}"
        " -- --spark-runner {spark_runner} --spark-master {spark_master} {spark_extra}"
        " {log}"
    )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False, parse_memory=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


shell("bcftools concat {bcftools_opts} {extra} {snakemake.input.calls} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2021, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(
    snakemake, parse_ref=False, parse_samples=False, parse_memory=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
filter = snakemake.params.get("filter", "")


if len(snakemake.output) > 1:
    raise Exception("Only one output file expected, got: " + str(len(snakemake.output)))


shell(
    "bcftools filter"
    " {bcftools_opts}"
    " {filter}"
    " {extra}"
    " {snakemake.input[0]}"
    " {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts

bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False, parse_memory=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("bcftools view {bcftools_opts} {extra} {snakemake.input[0]} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import os
from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)
url = (
    "https://github.com/lh3/CHM-eval/releases/"
    "download/{tag}/CHM-evalkit-{version}.tar"
).format(version=snakemake.params.version, tag=snakemake.params.tag)

os.makedirs(snakemake.output[0])
shell(
    "(curl -L {url} | tar --strip-components 1 -C {snakemake.output[0]} -xf - &&"
    "(cd {snakemake.output[0]}; chmod +x htsbox run-eval k8)) {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

url = "ftp://ftp.sra.ebi.ac.uk/vol1/run/ERR134/ERR1341796/CHM1_CHM13_2.bam"

pipefail = ""
fmt = "-b"
prefix = snakemake.params.get("first_n", "")
if prefix:
    prefix = "| head -n {} | samtools view -h -b".format(prefix)
    fmt = "-h"
    pipefail = "set +o pipefail"

    shell(
        """
        {pipefail}
        {{
            samtools view {fmt} {url} {prefix} > {snakemake.output.bam}
            samtools index {snakemake.output.bam}
        }} {log}
        """
    )
else:
    shell(
        """
        {{
            curl -L {url} > {snakemake.output.bam}
            samtools index {snakemake.output.bam}
        }} {log}
        """
    )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

kit = snakemake.input.kit
vcf = snakemake.input.vcf
build = snakemake.params.build
extra = snakemake.params.get("extra", "")

if not snakemake.output[0].endswith(".summary"):
    raise ValueError("Output file must end with .summary")
out = snakemake.output[0][:-8]

shell("({kit}/run-eval -g {build} -o {out} {extra} {vcf} | sh) {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2016, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"

from os.path import splitext

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
    raise ValueError("A reference genome has to be provided!")
elif len(snakemake.input) > 1:
    raise ValueError("Only one reference genome can be inputed!")

# Prefix that should be used for the database
prefix = snakemake.params.get("prefix", splitext(snakemake.output.idx[0])[0])

if len(prefix) > 0:
    prefix = "-p " + prefix

# Contrunction algorithm that will be used to build the database, default is bwtsw
construction_algorithm = snakemake.params.get("algorithm", "")

if len(construction_algorithm) != 0:
    construction_algorithm = "-a " + construction_algorithm

shell(
    "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
__author__ = "Johannes Köster, Julian de Ruiter"
__copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


import tempfile
from os import path
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Extract arguments.
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)
sort = snakemake.params.get("sorting", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")
samtools_opts = get_samtools_opts(snakemake, param_name="sort_extra")
java_opts = get_java_opts(snakemake)


index = snakemake.input.idx
if isinstance(index, str):
    index = path.splitext(snakemake.input.idx)[0]
else:
    index = path.splitext(snakemake.input.idx[0])[0]


# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements")


if sort_order not in {"coordinate", "queryname"}:
    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))


# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":
    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view {samtools_opts}"

elif sort == "samtools":
    # Add name flag if needed.
    if sort_order == "queryname":
        sort_extra += " -n"

    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort {samtools_opts} {sort_extra} -T {tmpdir}"

elif sort == "picard":
    # Sort alignments using picard SortSam.
    pipe_cmd = "picard SortSam {java_opts} {sort_extra} --INPUT /dev/stdin --TMP_DIR {tmpdir} --SORT_ORDER {sort_order} --OUTPUT {snakemake.output[0]}"

else:
    raise ValueError(f"Unexpected value for params.sort ({sort})")

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(bwa mem"
        " -t {snakemake.threads}"
        " {extra}"
        " {index}"
        " {snakemake.input.reads}"
        " | " + pipe_cmd + ") {log}"
    )
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq1}"
    " -p {snakemake.output.fastq2}"
    " {snakemake.input}"
    " > {snakemake.output.qc} {log}"
)
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 1, "Input must contain 1 (single-end) element."

extra = snakemake.params.get("extra", "")
adapters = snakemake.params.get("adapters", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

assert (
    extra != "" or adapters != ""
), "No options provided to cutadapt. Please use 'params: adapters=' or 'params: extra='."

shell(
    "cutadapt"
    " --cores {snakemake.threads}"
    " {adapters}"
    " {extra}"
    " -o {snakemake.output.fastq}"
    " {snakemake.input[0]}"
    " > {snakemake.output.qc} {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


bcftools_opts = get_bcftools_opts(snakemake, parse_ref=False, parse_memory=False)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


exclude = snakemake.input.get("exclude", "")
if exclude:
    exclude = f"-x {exclude}"


shell(
    "(OMP_NUM_THREADS={snakemake.threads} delly call"
    " -g {snakemake.input.ref}"
    " {exclude}"
    " {extra}"
    " {snakemake.input.alns} | "
    # Convert output to specified format
    "bcftools view"
    " {bcftools_opts}"
    ") {log}"
)
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path
import re
from tempfile import TemporaryDirectory
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_mem

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
# Define memory per thread (https://github.com/s-andrews/FastQC/blob/master/fastqc#L201-L222)
mem_mb = int(get_mem(snakemake, "MiB") / snakemake.threads)


def basename_without_ext(file_path):
    """Returns basename of file path, without the file extension."""

    base = path.basename(file_path)
    # Remove file extension(s) (similar to the internal fastqc approach)
    base = re.sub("\\.gz$", "", base)
    base = re.sub("\\.bz2$", "", base)
    base = re.sub("\\.txt$", "", base)
    base = re.sub("\\.fastq$", "", base)
    base = re.sub("\\.fq$", "", base)
    base = re.sub("\\.sam$", "", base)
    base = re.sub("\\.bam$", "", base)

    return base


# If you have multiple input files fastqc doesn't know what to do. Taking silently only first gives unapreciated results

if len(snakemake.input) > 1:
    raise IOError("Got multiple input files, I don't know how to process them!")

# Run fastqc, since there can be race conditions if multiple jobs
# use the same fastqc dir, we create a temp dir.
with TemporaryDirectory() as tempdir:
    shell(
        "fastqc"
        " --threads {snakemake.threads}"
        " --memory {mem_mb}"
        " {extra}"
        " --outdir {tempdir:q}"
        " {snakemake.input[0]:q}"
        " {log}"
    )

    # Move outputs into proper position.
    output_base = basename_without_ext(snakemake.input[0])
    html_path = path.join(tempdir, output_base + "_fastqc.html")
    zip_path = path.join(tempdir, output_base + "_fastqc.zip")

    if snakemake.output.html != html_path:
        shell("mv {html_path:q} {snakemake.output.html:q}")

    if snakemake.output.zip != zip_path:
        shell("mv {zip_path:q} {snakemake.output.zip:q}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2018, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake.io import Namedlist
from snakemake_wrapper_utils.java import get_java_opts

log = snakemake.log_fmt_shell(stdout=False, stderr=True)
extra_params = snakemake.params.get("extra", "")
java_opts = get_java_opts(snakemake)

bam_input = snakemake.input.bam

if bam_input is None:
    raise ValueError("Missing bam input file!")
elif not isinstance(bam_input, str):
    raise ValueError("Input bam should be a string: " + str(bam_input) + "!")

umi_input = snakemake.input.umi

if umi_input is None:
    raise ValueError("Missing input file with UMIs")
elif not (isinstance(umi_input, str) or isinstance(umi_input, Namedlist)):
    raise ValueError(
        "Input UMIs-file should be a string or a snakemake io list: "
        + str(umi_input)
        + "!"
    )

if not len(snakemake.output) == 1:
    raise ValueError("Only one output value expected: " + str(snakemake.output) + "!")
output_file = snakemake.output[0]


if output_file is None:
    raise ValueError("Missing output file!")
elif not isinstance(output_file, str):
    raise ValueError("Output bam-file should be a string: " + str(output_file) + "!")

shell(
    "fgbio {java_opts} AnnotateBamWithUmis"
    " -i {bam_input}"
    " -f {umi_input}"
    " -o {output_file}"
    " {extra_params}"
    " {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
__author__ = "Johannes Köster, Felix Mölder, Christopher Schröder"
__copyright__ = "Copyright 2017, Johannes Köster"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from tempfile import TemporaryDirectory
from snakemake_wrapper_utils.bcftools import get_bcftools_opts


log = snakemake.log_fmt_shell(stdout=False, stderr=True)
extra = snakemake.params.get("extra", "")
bcftools_sort_opts = get_bcftools_opts(
    snakemake,
    parse_threads=False,
    parse_ref=False,
    parse_regions=False,
    parse_samples=False,
    parse_targets=False,
    parse_output=False,
    parse_output_format=False,
)


pipe = ""
norm_params = snakemake.params.get("normalize")
if norm_params:
    bcftools_norm_opts = get_bcftools_opts(
        snakemake, parse_regions=False, parse_targets=False, parse_memory=False
    )
    pipe = f"bcftools norm {bcftools_norm_opts} {norm_params}"
else:
    bcftools_view_opts = get_bcftools_opts(
        snakemake,
        parse_ref=False,
        parse_regions=False,
        parse_targets=False,
        parse_memory=False,
    )
    pipe = f"bcftools view {bcftools_view_opts}"


if snakemake.threads == 1:
    freebayes = "freebayes"
else:
    chunksize = snakemake.params.get("chunksize", 100000)
    regions = f"<(fasta_generate_regions.py {snakemake.input.ref}.fai {chunksize})"

    if snakemake.input.get("regions"):
        regions = (
            "<(bedtools intersect -a "
            + r"<(sed 's/:\([0-9]*\)-\([0-9]*\)$/\t\1\t\2/' "
            + f"{regions}) -b {snakemake.input.regions} | "
            + r"sed 's/\t\([0-9]*\)\t\([0-9]*\)$/:\1-\2/')"
        )

    freebayes = f"freebayes-parallel {regions} {snakemake.threads}"


with TemporaryDirectory() as tempdir:
    shell(
        "({freebayes}"
        " --fasta-reference {snakemake.input.ref}"
        " {extra}"
        " {snakemake.input.alns}"
        " | bcftools sort {bcftools_sort_opts} --temp-dir {tempdir}"
        " | {pipe}"
        ") {log}"
    )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
__author__ = "Christopher Schröder"
__copyright__ = "Copyright 2020, Christopher Schröder"
__email__ = "[email protected]"
__license__ = "MIT"


import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts

# Extract arguments
extra = snakemake.params.get("extra", "")
reference = snakemake.input.get("ref")
embed_ref = snakemake.params.get("embed_ref", False)
java_opts = get_java_opts(snakemake)

log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)

if snakemake.output.bam.endswith(".cram") and embed_ref:
    output = "/dev/stdout"
    pipe_cmd = " | samtools view -h -O cram,embed_ref -T {reference} -o {snakemake.output.bam} -"
else:
    output = snakemake.output.bam
    pipe_cmd = ""

with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(gatk --java-options '{java_opts}' ApplyBQSR"
        " --input {snakemake.input.bam}"
        " --bqsr-recal-file {snakemake.input.recal_table}"
        " --reference {reference}"
        " {extra}"
        " --tmp-dir {tmpdir}"
        " --output {output}" + pipe_cmd + ") {log}"
    )
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
__author__ = "William Rowell"
__copyright__ = "Copyright 2020, William Rowell"
__email__ = "[email protected]"
__license__ = "MIT"

import sys
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

bed = snakemake.input.get("bed", "")
by = snakemake.params.get("by", "")
if by:
    if bed:
        sys.exit(
            "Either provide a bed input file OR a window size via params.by, not both."
        )
    else:
        by = f"--by {by}"
if bed:
    by = f"--by {bed}"

quantize_out = False
thresholds_out = False
regions_bed_out = False
region_dist_out = False
for file in snakemake.output:
    if ".per-base." in file and "--no-per-base" in extra:
        sys.exit(
            "You asked not to generate per-base output (--no-per-base), but your rule specifies a '.per-base.' output file. Remove one of the two."
        )
    if ".quantized.bed.gz" in file:
        quantize_out = True
    if ".thresholds.bed.gz" in file:
        thresholds_out = True
    if ".mosdepth.region.dist.txt" in file:
        region_dist_out = True
    if ".regions.bed.gz" in file:
        regions_bed_out = True


if by and not regions_bed_out:
    sys.exit(
        "You ask for by-region output. Please also specify *.regions.bed.gz as a rule output."
    )

if by and not region_dist_out:
    sys.exit(
        "You ask for by-region output. Please also specify *.mosdepth.region.dist.txt as a rule output."
    )

if (region_dist_out or regions_bed_out) and not by:
    sys.exit(
        "You specify *.regions.bed.gz and/or *.mosdepth.region.dist.txt as a rule output. You also need to ask for by-region output via 'input.bed' or 'params.by'."
    )

quantize = snakemake.params.get("quantize", "")
if quantize:
    if not quantize_out:
        sys.exit(
            "You ask for quantized output via params.quantize. Please also specify *.quantized.bed.gz as a rule output."
        )
    quantize = f"--quantize {quantize}"

if not quantize and quantize_out:
    sys.exit(
        "The rule has output *.quantized.bed.gz specified. Please also specify params.quantize to actually generate it."
    )


thresholds = snakemake.params.get("thresholds", "")
if thresholds:
    if not thresholds_out:
        sys.exit(
            "You ask for --thresholds output via params.thresholds. Please also specify *.thresholds.bed.gz as a rule output."
        )
    thresholds = f"--thresholds {thresholds}"

if not thresholds and thresholds_out:
    sys.exit(
        "The rule has output *.thresholds.bed.gz specified. Please also specify params.thresholds to actually generate it."
    )


precision = snakemake.params.get("precision", "")
if precision:
    precision = f"MOSDEPTH_PRECISION={precision}"


fasta = snakemake.input.get("fasta", "")
if fasta:
    fasta = f"--fasta {fasta}"


# mosdepth takes additional threads through its option --threads
# One thread for mosdepth
# Other threads are *additional* decompression threads passed to the '--threads' argument
threads = "" if snakemake.threads <= 1 else "--threads {}".format(snakemake.threads - 1)


# named output summary = "*.mosdepth.summary.txt" is required
prefix = snakemake.output.summary.replace(".mosdepth.summary.txt", "")


shell(
    "({precision} mosdepth {threads} {fasta} {by} {quantize} {thresholds} {extra} {prefix} {snakemake.input.bam}) {log}"
)
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
# Set this to False if multiqc should use the actual input directly
# instead of parsing the folders where the provided files are located
use_input_files_only = snakemake.params.get("use_input_files_only", False)

if not use_input_files_only:
    input_data = set(path.dirname(fp) for fp in snakemake.input)
else:
    input_data = set(snakemake.input)

output_dir = path.dirname(snakemake.output[0])
output_name = path.basename(snakemake.output[0])
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "multiqc"
    " {extra}"
    " --force"
    " -o {output_dir}"
    " -n {output_name}"
    " {input_data}"
    " {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess
import sys
from pathlib import Path
from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)


species = snakemake.params.species.lower()
build = snakemake.params.build
release = int(snakemake.params.release)
gtf_release = release
out_fmt = Path(snakemake.output[0]).suffixes
out_gz = (out_fmt.pop() and True) if out_fmt[-1] == ".gz" else False
out_fmt = out_fmt.pop().lstrip(".")


branch = ""
if build == "GRCh37":
    if release >= 81:
        # use the special grch37 branch for new releases
        branch = "grch37/"
    if release > 87:
        gtf_release = 87
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"


flavor = snakemake.params.get("flavor", "")
if flavor:
    flavor += "."


suffix = ""
if out_fmt == "gtf":
    suffix = "gtf.gz"
elif out_fmt == "gff3":
    suffix = "gff3.gz"
else:
    raise ValueError(
        "invalid format specified. Only 'gtf[.gz]' and 'gff3[.gz]' are currently supported."
    )


url = "ftp://ftp.ensembl.org/pub/{branch}release-{release}/{out_fmt}/{species}/{species_cap}.{build}.{gtf_release}.{flavor}{suffix}".format(
    release=release,
    gtf_release=gtf_release,
    build=build,
    species=species,
    out_fmt=out_fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
    flavor=flavor,
    branch=branch,
)


try:
    if out_gz:
        shell("curl -L {url} > {snakemake.output[0]} {log}")
    else:
        shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download annotation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
import sys
from itertools import product
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

spec = ("{build}" if int(release) > 75 else "{build}.{release}").format(
    build=build, release=release
)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
chromosome = snakemake.params.get("chromosome", "")
if datatype == "dna":
    if chromosome:
        suffixes = ["dna.chromosome.{}.fa.gz".format(chromosome)]
    else:
        suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

if chromosome:
    if not datatype == "dna":
        raise ValueError(
            "invalid datatype, to select a single chromosome the datatype must be dna"
        )

spec = spec.format(build=build, release=release)
url_prefix = f"ftp://ftp.ensembl.org/pub/{branch}release-{release}/fasta/{species}/{datatype}/{species.capitalize()}.{spec}"

success = False
for suffix in suffixes:
    url = f"{url_prefix}.{suffix}"

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    if len(suffixes) > 1:
        url = f"{url_prefix}.[{'|'.join(suffixes)}]"
    else:
        url = f"{url_prefix}.{suffixes[0]}"
    print(
        f"Unable to download requested sequence data from Ensembl ({url}). "
        "Please check whether above URL is currently available (might be a temporal server issue). "
        "Apart from that, did you check that this combination of species, build, and release is actually provided?",
        file=sys.stderr,
    )
    exit(1)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
__author__ = "Michael Chambers"
__copyright__ = "Copyright 2019, Michael Chambers"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(
    snakemake, parse_threads=False, parse_write_index=False, parse_output_format=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("samtools faidx {samtools_opts} {extra} {snakemake.input[0]} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
__author__ = "Antonie Vietor"
__copyright__ = "Copyright 2020, Antonie Vietor"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(
    snakemake, parse_write_index=False, parse_output=False, parse_output_format=False
)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell(
    "samtools idxstats {samtools_opts} {extra} {snakemake.input.bam} > {snakemake.output[0]} {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Samtools takes additional threads through its option -@
# One thread for samtools merge
# Other threads are *additional* threads passed to the '-@' argument
threads = "" if snakemake.threads <= 1 else " -@ {} ".format(snakemake.threads - 1)

shell(
    "samtools index {threads} {extra} {snakemake.input[0]} {snakemake.output[0]} {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(snakemake)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell("samtools merge {samtools_opts} {extra} {snakemake.input} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


# Snakemake wrapper: coordinate-sort a BAM/CRAM file with samtools sort,
# using a private temporary directory for sort spill files.

import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_mem
from snakemake_wrapper_utils.samtools import get_samtools_opts


samtools_opts = get_samtools_opts(snakemake)
extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# samtools sort's -m is per-thread, so split the job's total memory
# evenly across the allotted threads.
mem_per_thread_mb = int(get_mem(snakemake) / snakemake.threads)

with tempfile.TemporaryDirectory() as tmpdir:
    # -T prefix for temporary sort chunks; the directory (and any leftover
    # chunks on failure) is removed when the context exits.
    tmp_prefix = Path(tmpdir) / "samtools_sort"

    shell(
        "samtools sort {samtools_opts} -m {mem_per_thread_mb}M {extra} -T {tmp_prefix} {snakemake.input[0]} {log}"
    )
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


# Snakemake wrapper: run `samtools stats` on a BAM/CRAM file, optionally
# restricted to a BED file of target regions and/or a region string.

from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts


# Optional BED file restricting statistics to target regions (-t).
bed = snakemake.input.get("bed", "")
if bed:
    bed = f"-t {bed}"

# stats writes to stdout, so output/format/index handling is disabled and
# the report is captured via shell redirection instead.
samtools_opts = get_samtools_opts(
    snakemake, parse_write_index=False, parse_output=False, parse_output_format=False
)


extra = snakemake.params.get("extra", "")
region = snakemake.params.get("region", "")  # e.g. "chr1:1-1000000"
# stdout carries the report; only stderr goes to the log file
log = snakemake.log_fmt_shell(stdout=False, stderr=True)


shell(
    "samtools stats {samtools_opts} {extra} {snakemake.input[0]} {bed} {region} > {snakemake.output[0]} {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


# Snakemake wrapper: filter/convert an alignment file with `samtools view`,
# optionally restricted to a region string.

from snakemake.shell import shell
from snakemake_wrapper_utils.samtools import get_samtools_opts

samtools_opts = get_samtools_opts(snakemake)
extra = snakemake.params.get("extra", "")
region = snakemake.params.get("region", "")  # e.g. "chr1:1-1000000"
# append=True: keep earlier log content if the log file already exists
log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)


shell("samtools view {samtools_opts} {extra} {snakemake.input[0]} {region} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
__author__ = "Johannes Köster, Derek Croote"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

# Snakemake wrapper: download an SRA accession with fasterq-dump and, based on
# the declared output extensions, compress the resulting FASTQ files with
# pigz (.gz) or pbzip2 (.bz2).

import os
import tempfile
from snakemake.shell import shell
from snakemake_wrapper_utils.snakemake import get_mem


log = snakemake.log_fmt_shell(stdout=True, stderr=True)
extra = snakemake.params.get("extra", "")


# Parse memory
mem_mb = get_mem(snakemake, "MiB")


# Outdir
outdir = os.path.dirname(snakemake.output[0])
if outdir:
    outdir = f"--outdir {outdir}"


# Output compression
compress = ""
# NOTE: this `mem` holds pbzip2's -m<MB> flag; it is deliberately rebound
# below to fasterq-dump's --mem syntax before the main shell call.
mem = f"-m{mem_mb}" if mem_mb else ""

for output in snakemake.output:
    # fasterq-dump emits uncompressed files named like the output minus the
    # compression suffix; compress each one in place afterwards.
    out_name, out_ext = os.path.splitext(output)
    if out_ext == ".gz":
        compress += f"pigz -p {snakemake.threads} {out_name}; "
    elif out_ext == ".bz2":
        compress += f"pbzip2 -p{snakemake.threads} {mem} {out_name}; "


with tempfile.TemporaryDirectory() as tmpdir:
    # rebind `mem` to fasterq-dump's --mem flag (see NOTE above)
    mem = f"--mem {mem_mb}M" if mem_mb else ""

    shell(
        "(fasterq-dump --temp {tmpdir} --threads {snakemake.threads} {mem} "
        "{extra} {outdir} {snakemake.wildcards.accession}; "
        "{compress}"
        ") {log}"
    )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


# Snakemake wrapper: build a tabix index for a bgzip-compressed, sorted
# tab-delimited file (e.g. VCF/BED/GFF). All tabix flags come from params.

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell("tabix {snakemake.params} {snakemake.input[0]} {log}")
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2023, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

# Snakemake wrapper: download and install an Ensembl VEP annotation cache for
# the given species/build/release into the output directory.

import tempfile
from pathlib import Path
from snakemake.shell import shell


extra = snakemake.params.get("extra", "")

try:
    release = int(snakemake.params.release)
except ValueError:
    raise ValueError("The parameter release is supposed to be an integer.")

with tempfile.TemporaryDirectory() as tmpdir:
    # We download the cache tarball manually because vep_install does not consider proxy settings (in contrast to curl).
    # See https://github.com/bcbio/bcbio-nextgen/issues/1080
    # Ensembl renamed the FTP subdirectory from "VEP" to "vep" in release 97.
    vep_dir = "vep" if release >= 97 else "VEP"
    cache_tarball = (
        f"{snakemake.params.species}_vep_{release}_{snakemake.params.build}.tar.gz"
    )
    log = snakemake.log_fmt_shell(stdout=True, stderr=True)
    shell(
        "curl -L ftp://ftp.ensembl.org/pub/release-{snakemake.params.release}/"
        "variation/{vep_dir}/{cache_tarball} "
        "-o {tmpdir}/{cache_tarball} {log}"
    )

    # Second command appends to the same log file.
    log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)
    # --AUTO c: install only the cache; --CACHEURL points vep_install at the
    # tarball we just downloaded instead of letting it fetch one itself.
    shell(
        "vep_install --AUTO c "
        "--SPECIES {snakemake.params.species} "
        "--ASSEMBLY {snakemake.params.build} "
        "--CACHE_VERSION {release} "
        "--CACHEURL {tmpdir} "
        "--CACHEDIR {snakemake.output} "
        "--CONVERT "
        "--NO_UPDATE "
        "{extra} {log}"
    )
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

# Snakemake wrapper: download the Ensembl VEP plugins archive for the given
# release and unpack its contents (without the archive's root directory)
# into the output directory.

import sys
from pathlib import Path
from urllib.request import urlretrieve
from zipfile import ZipFile
from tempfile import NamedTemporaryFile

# Redirect stderr into the Snakemake log file, if one was configured.
if snakemake.log:
    sys.stderr = open(snakemake.log[0], "w")

outdir = Path(snakemake.output[0])
# Intentionally fails if the directory already exists, so stale plugin
# installations are never silently mixed with the new one.
outdir.mkdir()

with NamedTemporaryFile() as tmp:
    urlretrieve(
        "https://github.com/Ensembl/VEP_plugins/archive/release/{release}.zip".format(
            release=snakemake.params.release
        ),
        tmp.name,
    )

    with ZipFile(tmp.name) as f:
        for member in f.infolist():
            memberpath = Path(member.filename)
            if len(memberpath.parts) == 1:
                # skip root dir
                continue
            # strip the leading "VEP_plugins-release-<X>/" path component
            targetpath = outdir / memberpath.relative_to(memberpath.parts[0])
            if member.is_dir():
                targetpath.mkdir()
            else:
                with open(targetpath, "wb") as out:
                    out.write(f.read(member.filename))
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
__author__ = "Johannes Köster, Christopher Schröder"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import tempfile
from pathlib import Path
from snakemake.shell import shell
from snakemake_wrapper_utils.java import get_java_opts
from snakemake_wrapper_utils.samtools import get_samtools_opts, infer_out_format


log = snakemake.log_fmt_shell()
extra = snakemake.params.get("extra", "")
# the --SORTING_COLLECTION_SIZE_RATIO default of 0.25 might
# indicate a JVM memory overhead of that proportion
java_opts = get_java_opts(snakemake, java_mem_overhead_factor=0.3)
samtools_opts = get_samtools_opts(snakemake)


tool = "MarkDuplicates"
if snakemake.params.get("withmatecigar", False):
    tool = "MarkDuplicatesWithMateCigar"


bams = snakemake.input.bams
if isinstance(bams, str):
    bams = [bams]
bams = list(map("--INPUT {}".format, bams))


output = snakemake.output.bam
output_fmt = infer_out_format(output)
convert = ""
if output_fmt == "CRAM":
    output = "/dev/stdout"

    # NOTE: output format inference should be done by snakemake-wrapper-utils. Keeping it here for backwards compatibility.
    if snakemake.params.get("embed_ref", False):
        samtools_opts += ",embed_ref"

    convert = f" | samtools view {samtools_opts}"
elif output_fmt == "BAM" and snakemake.output.get("idx"):
    extra += " --CREATE_INDEX"


with tempfile.TemporaryDirectory() as tmpdir:
    shell(
        "(picard {tool}"  # Tool and its subcommand
        " {java_opts}"  # Automatic java option
        " {extra}"  # User defined parmeters
        " {bams}"  # Input bam(s)
        " --TMP_DIR {tmpdir}"
        " --OUTPUT {output}"  # Output bam
        " --METRICS_FILE {snakemake.output.metrics}"  # Output metrics
        " {convert}) {log}"  # Logging
    )


output_prefix = Path(snakemake.output.bam).with_suffix("")
if snakemake.output.get("idx"):
    if output_fmt == "BAM" and snakemake.output.idx != str(output_prefix) + ".bai":
        shell("mv {output_prefix}.bai {snakemake.output.idx}")
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

# Snakemake wrapper: download Ensembl variation VCFs for a species/build/
# release, concatenate them with bcftools, and optionally reheader the result
# against a provided FASTA index (fai) so contig definitions match.

import tempfile
import subprocess
import sys
import os
from snakemake.shell import shell
from snakemake.exceptions import WorkflowError

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build
# NOTE: `type` shadows the builtin; kept for wrapper-interface compatibility.
type = snakemake.params.type
chromosome = snakemake.params.get("chromosome", "")


branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"
elif snakemake.params.get("branch"):
    branch = snakemake.params.branch + "/"

# Old releases are only reachable via an explicit branch (e.g. grch37/).
if release < 98 and not branch:
    print("Ensembl releases <98 are unsupported.", file=open(snakemake.log[0], "w"))
    exit(1)

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

if chromosome and type != "all":
    raise ValueError(
        "Parameter chromosome given but chromosome-wise download"
        "is only implemented for type='all'."
    )

# Derive the list of filename suffixes to fetch, depending on variant type.
if type == "all":
    if species == "homo_sapiens" and release >= 93:
        # Human data >=93 is split per chromosome on the Ensembl FTP server.
        chroms = (
            list(range(1, 23)) + ["X", "Y", "MT"] if not chromosome else [chromosome]
        )
        suffixes = ["-chr{}".format(chrom) for chrom in chroms]
    else:
        if chromosome:
            raise ValueError(
                "Parameter chromosome given but chromosome-wise download"
                "is only implemented for homo_sapiens in releases >=93."
            )
        suffixes = [""]
elif type == "somatic":
    suffixes = ["_somatic"]
elif type == "structural_variations":
    suffixes = ["_structural_variations"]
else:
    raise ValueError(
        "Unsupported type {} (only all, somatic, structural_variations are allowed)".format(
            type
        )
    )

# Filenames were capitalized (e.g. Homo_sapiens) before release 91.
species_filename = species if release >= 91 else species.capitalize()

urls = [
    "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.vcf.gz".format(
        release=release,
        species=species,
        suffix=suffix,
        species_filename=species_filename,
        branch=branch,
    )
    for suffix in suffixes
]
names = [os.path.basename(url) for url in urls]

try:
    # Download all per-chromosome/per-type VCFs in one curl invocation.
    gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls)))
    workdir = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        # --naive concat is safe because all files share the same header layout.
        if snakemake.input.get("fai"):
            # Reheader against the reference .fai so contigs match the genome.
            shell(
                "(cd {tmpdir}; {gather} && "
                "bcftools concat -Oz --naive {names} > concat.vcf.gz && "
                "bcftools reheader --fai {workdir}/{snakemake.input.fai} concat.vcf.gz "
                "> {workdir}/{snakemake.output}) {log}"
            )
        else:
            shell(
                "(cd {tmpdir}; {gather} && "
                "bcftools concat -Oz --naive {names} "
                "> {workdir}/{snakemake.output}) {log}"
            )
except subprocess.CalledProcessError as e:
    # Most likely cause: the requested combination does not exist on the FTP.
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download variation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided? ",
        file=sys.stderr,
    )
    exit(1)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

# Snakemake wrapper: annotate variant calls with Ensembl VEP, streaming
# BCF/VCF through bcftools on both ends and loading the requested plugins.

import os
from pathlib import Path
from snakemake.shell import shell


def get_only_child_dir(path):
    """Return the single subdirectory of *path*; fail loudly otherwise."""
    children = [child for child in path.iterdir() if child.is_dir()]
    assert (
        len(children) == 1
    ), "Invalid VEP cache directory, only a single entry is allowed, make sure that cache was created with the snakemake VEP cache wrapper"
    return children[0]


extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

fork = "--fork {}".format(snakemake.threads) if snakemake.threads > 1 else ""
stats = snakemake.output.stats
cache = snakemake.input.get("cache", "")
plugins = snakemake.input.plugins
# Plugins that ship an auxiliary data file inside the plugins directory.
plugin_aux_files = {"LoFtool": "LoFtool_scores.txt", "ExACpLI": "ExACpLI_values.txt"}

# Build one "--plugin Name,<aux-or-input-path>" argument per requested plugin.
load_plugins = []
for plugin in snakemake.params.plugins:
    if plugin in plugin_aux_files.keys():
        aux_path = os.path.join(plugins, plugin_aux_files[plugin])
        load_plugins.append(",".join([plugin, aux_path]))
    else:
        # Other plugins take their data file from a same-named (lowercase) input.
        load_plugins.append(",".join([plugin, snakemake.input.get(plugin.lower(), "")]))
load_plugins = " ".join(map("--plugin {}".format, load_plugins))

# Choose the bcftools output mode from the declared output extension.
if snakemake.output.calls.endswith(".vcf.gz"):
    fmt = "z"
elif snakemake.output.calls.endswith(".bcf"):
    fmt = "b"
else:
    fmt = "v"

fasta = snakemake.input.get("fasta", "")
if fasta:
    fasta = "--fasta {}".format(fasta)

gff = snakemake.input.get("gff", "")
if gff:
    gff = "--gff {}".format(gff)

if cache:
    # Cache layout (as created by the VEP cache wrapper):
    #   <cache>/<species>[_refseq]/<release>_<build>/
    # Derive species, release and build from that directory structure.
    entrypath = get_only_child_dir(get_only_child_dir(Path(cache)))
    species = (
        entrypath.parent.name[:-7]
        if entrypath.parent.name.endswith("_refseq")
        else entrypath.parent.name
    )
    release, build = entrypath.name.split("_")
    cache = (
        "--offline --cache --dir_cache {cache} --cache_version {release} --species {species} --assembly {build}"
    ).format(cache=cache, release=release, build=build, species=species)

# Decompress calls with bcftools, annotate on the stream, re-encode to the
# requested format on the way out.
shell(
    "(bcftools view '{snakemake.input.calls}' | "
    "vep {extra} {fork} "
    "--format vcf "
    "--vcf "
    "{cache} "
    "{gff} "
    "{fasta} "
    "--dir_plugins {plugins} "
    "{load_plugins} "
    "--output_file STDOUT "
    "--stats_file {stats} | "
    "bcftools view -O{fmt} > {snakemake.output.calls}) {log}"
)
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2017, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

# Snakemake wrapper: render an interactive HTML report from a datavzrd
# config file into the first output directory.

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")

shell("datavzrd {snakemake.input.config} {extra} --output {snakemake.output[0]} {log}")
21
22
wrapper:
    "v2.5.0/bio/vep/annotate"
47
48
wrapper:
    "v2.5.0/bio/vep/annotate"
66
67
shell:
    "(bcftools view --threads {threads} {input.bcf} {params.pipes} | bcftools view --threads {threads} -Ob > {output}) 2> {log}"
83
84
shell:
    "rbt vcf-annotate-dgidb {input} {params.datasources} > {output} 2> {log}"
97
98
wrapper:
    "v2.3.2/bio/bcftools/concat"
14
15
wrapper:
    "v2.3.2/bio/bcftools/concat"
24
25
wrapper:
    "v2.3.2/bio/benchmark/chm-eval-sample"
38
39
wrapper:
    "v2.3.2/bio/samtools/sort"
52
53
shell:
    "samtools fastq {input} -1 {output.fq1} -2 {output.fq2} 2> {log}"
66
67
wrapper:
    "v2.3.2/bio/benchmark/chm-eval-kit"
79
80
shell:
    "awk '{{ print $1,\"chr\"$1 }}' OFS='\t' {input} > {output} 2> {log}"
95
96
97
shell:
    "(bcftools annotate --rename-chrs {input.map} {input} | "
    "bcftools view --targets {params.targets} > {output}) 2> {log} "
112
113
wrapper:
    "v2.3.2/bio/benchmark/chm-eval"
36
37
shell:
    "varlociraptor estimate alignment-properties {input.ref} --bam {input.bam} > {output} 2> {log}"
58
59
60
61
shell:
    "varlociraptor preprocess variants --candidates {input.candidates} {params.extra} "
    "--alignment-properties {input.alignment_props} {input.ref} --bam {input.bam} --output {output} "
    "2> {log}"
82
83
84
shell:
    "(varlociraptor call variants {params.extra} generic --obs {params.obs}"
    " --scenario {input.scenario} {params.postprocess} {output}) 2> {log}"
 98
 99
100
shell:
    "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` "
    "-Ob {input} > {output} 2> {log}"
113
114
wrapper:
    "v2.3.2/bio/bcftools/concat"
21
22
wrapper:
    "v2.3.2/bio/freebayes"
39
40
wrapper:
    "v2.3.2/bio/delly"
53
54
shell:
    """bcftools view -e 'INFO/SVTYPE="BND"' {input} -Ob > {output} 2> {log}"""
68
69
wrapper:
    "v2.3.2/bio/bcftools/filter"
85
86
shell:
    "rbt vcf-split {input} {output}"
15
16
script:
    "../scripts/split-call-tables.py"
38
39
script:
    "../scripts/oncoprint.py"
103
104
wrapper:
    "v2.6.0/utils/datavzrd"
14
15
16
shell:
    "(bcftools norm -Ou --do-not-normalize --multiallelics -any {input} | "
    'vembrane filter {params.aux} "{params.filter}" --output-fmt bcf --output {output}) &> {log}'
32
33
shell:
    'vembrane filter {params.aux} "{params.filter}" {input.bcf} --output-fmt bcf --output {output} &> {log}'
49
50
shell:
    "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}"
63
64
wrapper:
    "v2.3.2/bio/bcftools/concat"
78
79
80
shell:
    "varlociraptor filter-calls control-fdr {input} {params.query[mode]} --var {wildcards.vartype} "
    "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}"
93
94
wrapper:
    "v2.3.2/bio/bcftools/concat"
106
107
shell:
    "varlociraptor decode-phred < {input} > {output} 2> {log}"
14
15
wrapper:
    "v2.3.2/bio/bwa/mem"
27
28
shell:
    "cat {input} > {output} 2> {log}"
43
44
wrapper:
    "v2.3.2/bio/fgbio/annotatebamwithumis"
62
63
wrapper:
    "v2.5.0/bio/picard/markduplicates"
78
79
shell:
    "rbt collapse-reads-to-fragments bam {input} {output} &> {log}"
98
99
wrapper:
    "v2.3.2/bio/bwa/mem"
112
113
wrapper:
    "v2.3.2/bio/samtools/merge"
124
125
wrapper:
    "v2.3.2/bio/samtools/sort"
147
148
wrapper:
    "v1.25.0/bio/gatk/baserecalibratorspark"
170
171
wrapper:
    "v2.3.2/bio/gatk/applybqsr"
13
14
15
16
shell:
    "( bedtools intersect -a {input.covered} -b {input.coding} | "
    "  awk '{{ covered += $3 - $2 }} END {{ print covered }}' >{output} "
    ") 2> {log} "
36
37
38
39
40
41
42
shell:
    "(varlociraptor estimate mutational-burden "
    "--plot-mode {wildcards.mode} "
    "--coding-genome-size $( cat {input.coverage_breadth} ) "
    "--events {params.events} "
    "--sample {wildcards.alias} "
    "< {input.calls} | vl2svg > {output}) 2> {log}"
8
9
shell:
    "curl https://rothsj06.dmz.hpc.mssm.edu/revel-v1.3_all_chromosomes.zip -o {output} &> {log}"
23
24
25
26
27
28
29
30
31
32
33
34
35
shell:
    """
    tmpfile=$(mktemp {resources.tmpdir}/revel_scores.XXXXXX)
    unzip -p {input} | tr "," "\t" | sed '1s/.*/#&/' | bgzip -c > $tmpfile
    if [ "{params.build}" == "GRCh38" ] ; then
        zgrep -h -v ^#chr $tmpfile | awk '$3 != "." ' | sort -k1,1 -k3,3n - | cat <(zcat $tmpfile | head -n1) - | bgzip -c > {output}
    elif [ "{params.build}" == "GRCh37" ] ; then
        cat $tmpfile > {output}
    else
        echo "Annotation of REVEL scores only supported for GRCh37 or GRCh38" > {log}
        exit 125
    fi
    """
12
13
shell:
    "fgbio AssignPrimers -i {input.bam} -p {input.primers} -m {output.metric} -o {output.assigned} &> {log}"
26
27
script:
    "../scripts/filter_primers.rs"
43
44
shell:
    "fgbio TrimPrimers -H -i {input.bam} -p {input.primers} -s {params.sort_order} {params.single_primer} -o {output.trimmed} &> {log}"
59
60
61
shell:
    "mkdir {output} & "
    "bowtie-build {input} {params.prefix} &> {log}"
79
80
shell:
    "bowtie {params.primers} -x {params.prefix} {params.insertsize} {params.primer_format} -S | samtools view -b - > {output} 2> {log}"
92
93
wrapper:
    "v2.3.2/bio/samtools/view"
109
110
shell:
    "samtools sort -n {input} | bamToBed -i - {params.format} > {output} 2> {log}"
122
123
script:
    "../scripts/build_primer_regions.py"
 9
10
wrapper:
    "v2.3.2/bio/fastqc"
21
22
wrapper:
    "v2.3.2/bio/samtools/idxstats"
SnakeMake From line 21 of rules/qc.smk
32
33
wrapper:
    "v2.3.2/bio/samtools/stats"
SnakeMake From line 32 of rules/qc.smk
50
51
wrapper:
    "v2.3.2/bio/multiqc"
13
14
wrapper:
    "v2.3.2/bio/reference/ensembl-sequence"
SnakeMake From line 13 of rules/ref.smk
25
26
wrapper:
    "v2.3.2/bio/samtools/faidx"
SnakeMake From line 25 of rules/ref.smk
39
40
shell:
    "samtools dict {input} > {output} 2> {log} "
58
59
wrapper:
    "v2.5.0/bio/reference/ensembl-variation"
SnakeMake From line 58 of rules/ref.smk
73
74
wrapper:
    "v2.3.2/bio/reference/ensembl-annotation"
SnakeMake From line 73 of rules/ref.smk
110
111
shell:
    "(rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}) 2> {log}"
124
125
wrapper:
    "v2.3.2/bio/bwa/index"
SnakeMake From line 124 of rules/ref.smk
138
139
wrapper:
    "v2.3.2/bio/vep/cache"
SnakeMake From line 138 of rules/ref.smk
149
150
wrapper:
    "v2.3.2/bio/vep/plugins"
SnakeMake From line 149 of rules/ref.smk
14
15
16
17
18
shell:
    """
    (cat {input} | sort -k1,1 -k2,2n - | mergeBed -i - | awk \'{{sub("^chr","", $0); print}}\' > {output} \
    && if [[ ! -s {output} ]]; then >&2 echo 'Empty output: target file appears to be invalid'; exit 1; fi) 2> {log}
    """
34
35
wrapper:
    "v2.3.2/bio/mosdepth"
50
51
shell:
    "zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - -d 15000 > {output} 2> {log}"
66
67
shell:
    "zcat {input} | sort -k1,1 -k2,2n - | mergeBed -i - > {output} 2> {log}"
86
87
88
89
shell:
    "cat {input.regions} | grep -f <(head -n {params.chroms} {input.fai} | "
    'awk \'{{print "^"$1"\\t"}}\') {params.filter_targets} | sort -k1,1 -k2,2n '
    "> {output} 2> {log}"
 99
100
shell:
    "curl {params.url} -o {output} &> {log}"
13
14
15
shell:
    'vembrane table --header "{params.config[header]}" "{params.config[expr]}" '
    "{input.bcf} > {output.bcf} 2> {log}"
27
28
script:
    "../scripts/tsv_to_xlsx.py"
16
17
wrapper:
    "v2.3.2/bio/bcftools/concat"
33
34
35
36
37
shell:
    "varlociraptor "
    "call variants --testcase-prefix {output} --testcase-locus {wildcards.locus} "
    "{params.extra} generic --obs {params.obs} "
    "--scenario {input.scenario} 2> {log}"
7
8
wrapper:
    "v2.3.2/bio/sra-tools/fasterq-dump"
21
22
shell:
    "cat {input} > {output} 2> {log}"
38
39
wrapper:
    "v2.3.2/bio/cutadapt/pe"
54
55
wrapper:
    "v2.3.2/bio/cutadapt/se"
67
68
shell:
    "cat {input} > {output} 2> {log}"
10
11
shell:
    "bcftools index {input} 2> {log}"
21
22
wrapper:
    "v2.3.2/bio/bcftools/view"
32
33
wrapper:
    "v2.3.2/bio/samtools/index"
46
47
wrapper:
    "v2.3.2/bio/tabix/index"
10
11
shell:
    "vl2svg {input} {output} 2> {log}"
SnakeMake From line 10 of rules/vega.smk
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd


def parse_bed(log_file, out):
    """Convert a 6-column primer BED file into the primer-regions table.

    Each BED interval becomes one row: forward-strand primers ('+') fill the
    left_start/left_end columns, reverse-strand ('-') the right columns, with
    -1 marking the absent mate. Rows with an invalid strand are reported to
    *log_file*. BED starts are 0-based, hence the +1 conversion.

    NOTE(review): columns are accessed by their original positional labels
    (0, 1, 2, 5) because pandas keeps those labels when usecols subsets a
    header-less file. Relies on module-level `chunksize` and `snakemake`.
    """
    print("chrom\tleft_start\tleft_end\tright_start\tright_end", file=out)
    for data_primers in pd.read_csv(
        snakemake.input[0],
        sep="\t",
        header=None,
        chunksize=chunksize,
        usecols=[0, 1, 2, 5],
    ):
        for row in data_primers.iterrows():
            row_id = row[0]
            row = row[1]
            if row[5] == "+":
                print(
                    "{chrom}\t{start}\t{end}\t-1\t-1".format(
                        chrom=row[0], start=row[1] + 1, end=row[2]
                    ),
                    file=out,
                )
            elif row[5] == "-":
                print(
                    "{chrom}\t-1\t-1\t{start}\t{end}".format(
                        chrom=row[0], start=row[1] + 1, end=row[2]
                    ),
                    file=out,
                )
            else:
                print("Invalid strand in row {}".format(row_id), file=log_file)


def parse_bedpe(log_file, out):
    """Convert a BEDPE primer file into the primer-regions table.

    A primer pair is valid only when both mates lie on the same chromosome
    (column 0 == column 3); invalid pairs are written to *log_file*. Start
    coordinates (columns 1 and 4) are shifted from 0-based BED to 1-based.
    Relies on module-level `chunksize` and `snakemake`.
    """
    for data_primers in pd.read_csv(
        snakemake.input[0],
        sep="\t",
        header=None,
        chunksize=chunksize,
        usecols=[0, 1, 2, 3, 4, 5],
    ):
        valid_primers = data_primers[0] == data_primers[3]
        valid_data = data_primers[valid_primers].copy()
        # BED starts are 0-based: shift both mates' start columns by one.
        valid_data.iloc[:, [1, 4]] += 1
        # the second chromosome column is redundant once equality is checked
        valid_data.drop(columns=[3], inplace=True)
        valid_data.dropna(how="all", inplace=True)
        valid_data.to_csv(
            out,
            sep="\t",
            index=False,
            header=["chrom", "left_start", "left_end", "right_start", "right_end"],
        )
        # log rejected (cross-chromosome) primer pairs verbatim
        print(
            data_primers[~valid_primers].to_csv(sep="\t", index=False, header=False),
            file=log_file,
        )


# Process the primer file in chunks to bound memory usage on large inputs.
chunksize = 10**6
with open(snakemake.output[0], "w") as out:
    with open(snakemake.log[0], "w") as log_file:
        # Dispatch on the input extension: BEDPE has paired mates per row,
        # anything else is treated as plain 6-column BED.
        if snakemake.input[0].endswith("bedpe"):
            parse_bedpe(log_file, out)
        else:
            parse_bed(log_file, out)
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import sys

# Redirect stderr (including warnings emitted by the imports below) into the
# Snakemake log file for this job.
sys.stderr = open(snakemake.log[0], "w")

import os
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.feature_selection import chi2
from statsmodels.stats.multitest import fdrcorrection


def join_group_hgvsgs(df):
    """Collapse all hgvsg identifiers of one group into a single sorted,
    comma-separated string, then drop rows made identical by that merge."""
    # comma-joined cells may hold several ids: split, dedupe, sort for stability
    unique_ids = pd.unique(df["hgvsg"].str.split(",").explode())
    joined = ",".join(np.sort(unique_ids))
    df.loc[:, "hgvsg"] = joined
    return df.drop_duplicates()


def join_group_consequences(df):
    """Set every row's 'consequence' to the sorted union of all consequence
    terms occurring in *df* (comma-joined cells are split first)."""
    exploded = df["consequence"].str.split(",").explode()
    merged = ",".join(np.sort(pd.unique(exploded)))
    df.loc[:, "consequence"] = merged
    return df


def join_gene_vartypes(df):
    """Reduce *df* to its first row, with 'vartype' replaced by the
    comma-joined unique vartypes of the whole group (first-seen order)."""
    vartypes = ",".join(pd.unique(df["vartype"]))
    # copy the slice before assigning: writing into an iloc view raises
    # SettingWithCopyWarning and is unreliable under pandas copy-on-write
    df = df.iloc[:1].copy()
    df.loc[:, "vartype"] = vartypes
    return df


def load_calls(path, group):
    """Load one group's variant-call TSV, tag each row with *group*,
    normalize '&'-separated consequence terms to commas, and deduplicate."""
    wanted_columns = ["symbol", "vartype", "hgvsp", "hgvsg", "consequence"]
    calls = pd.read_csv(path, sep="\t", usecols=wanted_columns)
    calls["group"] = group
    # VEP joins multiple consequences with '&'; the rest of the pipeline
    # expects comma-separated terms
    calls.loc[:, "consequence"] = calls["consequence"].str.replace("&", ",")
    return calls.drop_duplicates()


def load_group_annotation():
    """Load the optional per-group annotation table.

    If an annotation TSV is configured, read it, index by group and restrict
    it to the configured groups (in their configured order). Otherwise build
    an empty placeholder frame with one row per group. The result is
    transposed so that annotation keys become rows and groups become columns,
    matching the oncoprint matrix orientation.
    """
    if snakemake.input.group_annotation:
        group_annotation = (
            pd.read_csv(
                snakemake.input.group_annotation, sep="\t", dtype={"group": str}
            )
            .set_index("group")
            .sort_index()
        )
        # .loc both subsets and reorders to the configured group order
        group_annotation = group_annotation.loc[snakemake.params.groups]
    else:
        group_annotation = pd.DataFrame({"group": snakemake.params.groups}).set_index(
            "group"
        )
    group_annotation = group_annotation.T
    group_annotation.index.name = "annotation"
    return group_annotation


def sort_by_recurrence(matrix, no_occurence_check_func):
    """Return *matrix* with rows ordered by recurrence: rows missing from the
    fewest groups first, as judged by *no_occurence_check_func* (a callable
    mapping the matrix to a boolean "no call here" frame).

    Unlike the previous implementation, this does not mutate the caller's
    DataFrame by temporarily inserting a helper column: ``assign`` works on
    a copy, so *matrix* itself stays untouched.
    """
    nocalls = no_occurence_check_func(matrix).sum(axis="columns")
    ordered = matrix.assign(nocalls=nocalls).sort_values("nocalls", ascending=True)
    return ordered.drop(labels=["nocalls"], axis="columns")


def add_missing_groups(matrix, groups, index_mate):
    """Guarantee that *matrix* has an (*index_mate*, group) column for every
    configured group, filling columns for call-free groups with pd.NA."""
    absent = (g for g in groups if (index_mate, g) not in matrix.columns)
    for group in absent:
        matrix[(index_mate, group)] = pd.NA
    return matrix


def attach_group_annotation(matrix, group_annotation):
    """Stack the group-annotation rows above the data rows of *matrix* and
    return a flat frame with the former index restored as leading columns."""
    index_cols = matrix.index.names
    annotation_rows = group_annotation.reset_index(drop=True)
    data_rows = matrix.reset_index()
    combined = pd.concat([annotation_rows, data_rows])
    # round-trip through the index so the index columns end up first
    return combined.set_index(index_cols).reset_index()


def gene_oncoprint(calls):
    """Build the gene-level oncoprint matrix (genes x groups, cells holding
    the comma-joined vartypes), sorted by recurrence when possible.

    For an empty call set, return an empty frame with the expected columns
    so downstream formatting still works.
    """
    calls = calls[["group", "symbol", "vartype", "consequence"]]
    if not calls.empty:
        # first merge consequence terms per gene, then vartypes per (group, gene)
        grouped = (
            calls.drop_duplicates().groupby(["symbol"]).apply(join_group_consequences)
        ).reset_index(drop=True)
        grouped = (
            grouped.drop_duplicates()
            .groupby(["group", "symbol"])
            .apply(join_gene_vartypes)
        )
        # pivot groups into columns; cells become vartype strings or NA
        matrix = grouped.set_index(["symbol", "consequence", "group"]).unstack(
            level="group"
        )
        matrix = add_missing_groups(matrix, snakemake.params.groups, "vartype")
        matrix.columns = matrix.columns.droplevel(0)  # remove superfluous header
        if len(matrix.columns) > 1:
            # sort by recurrence
            matrix = sort_by_recurrence(matrix, lambda matrix: matrix.isna())

        return matrix
    else:
        cols = ["symbol", "consequence"] + list(snakemake.params.groups)
        return pd.DataFrame({col: [] for col in cols}).set_index(
            list(snakemake.params.groups)
        )


def variant_oncoprint(gene_calls, group_annotation):
    """Build the per-variant oncoprint matrix for one gene (variants x groups,
    'X' marking presence), sorted by recurrence, with the group annotation
    rows attached on top."""
    # copy the column projection so the 'exists' assignment below writes into
    # our own frame instead of a view of the caller's (SettingWithCopyWarning,
    # and a silent no-op under pandas copy-on-write)
    gene_calls = gene_calls[["group", "hgvsp", "hgvsg", "consequence"]].copy()
    gene_calls.loc[:, "exists"] = "X"
    grouped = gene_calls.drop_duplicates().groupby(["hgvsp"]).apply(join_group_hgvsgs)
    # pivot groups into columns; cells become "X" or NA
    matrix = grouped.set_index(["hgvsp", "hgvsg", "consequence", "group"]).unstack(
        level="group"
    )

    matrix = add_missing_groups(matrix, snakemake.params.groups, "exists")
    matrix.columns = matrix.columns.droplevel(0)  # remove superfluous header

    if len(matrix.columns) > 1:
        # sort by recurrence
        matrix = sort_by_recurrence(matrix, lambda matrix: matrix.isna())

    matrix = attach_group_annotation(matrix, group_annotation)

    return matrix


def store(data, output, labels_df, label_idx=None):
    """Write *data* as TSV to *output*, with label rows prepended.

    When *label_idx* is given, only that single row of *labels_df* is
    prepended; otherwise all of its rows are.
    """
    selected_labels = labels_df if label_idx is None else labels_df.iloc[[label_idx]]

    # splice the label rows above the data rows, keeping the data's index
    index_cols = data.index.names
    original_cols = data.columns
    combined = pd.concat([selected_labels, data.reset_index()]).set_index(index_cols)
    # concat may introduce/reorder columns; restore the original order
    combined = combined[original_cols]

    combined.to_csv(output, sep="\t")


def sort_oncoprint_labels(data):
    """For each label, write a TSV of *data* sorted by label association.

    For every label row in snakemake.params.labels, each feature (row of
    *data*) is tested for dependency on the label via a chi-squared test;
    features are ordered by ascending p-value and sample columns are ordered
    by label value. One TSV per label is written into
    snakemake.output.gene_oncoprint_sortings.
    """
    labels_df = snakemake.params.labels
    labels = labels_df.index

    for label_idx, label in enumerate(labels):
        outdata = data
        if not data.empty:
            # binary presence/absence feature matrix: samples x features
            feature_matrix = data.reset_index(drop=True).T.copy()
            feature_matrix[~pd.isna(feature_matrix)] = True
            feature_matrix[pd.isna(feature_matrix)] = False

            # target vector: label values, converted into factors
            target_vector = labels_df.loc[label]
            # ignore any NA in the target vector and correspondingly remove the rows in the feature matrix
            not_na_target_vector = target_vector[~pd.isna(target_vector)].astype("category")
            feature_matrix = feature_matrix.loc[not_na_target_vector.index]

            # chi-squared test of each feature against the label; order
            # features by ascending p-value (strongest dependency first)
            _, pvals = chi2(feature_matrix, not_na_target_vector)
            sorted_idx = np.argsort(pvals)

            # multiple-testing correction of the p-values (FDR)
            _, fdr = fdrcorrection(pvals)

            # clone data
            sorted_data = data.copy(deep=True)

            # sort by label
            sorted_target_vector = target_vector.sort_values()
            sorted_data = sorted_data[sorted_target_vector.index]

            # add the dependency statistics as leading columns
            sorted_data.insert(0, "FDR dependency", np.around(fdr, 3))
            sorted_data.insert(0, "p-value dependency", np.around(pvals, 3))

            outdata = sorted_data.iloc[sorted_idx]
        outpath = os.path.join(snakemake.output.gene_oncoprint_sortings, f"{label}.tsv")
        store(outdata, outpath, labels_df, label_idx=label_idx)


# one calls table per group, concatenated into a single dataframe
calls = pd.concat(
    [
        load_calls(path, sample)
        for path, sample in zip(snakemake.input.calls, snakemake.params.groups)
    ]
)


# NOTE: this rebinds the name gene_oncoprint from the function to its result;
# the function cannot be called again afterwards
gene_oncoprint = gene_oncoprint(calls)

group_annotation = load_group_annotation()
gene_oncoprint_main = attach_group_annotation(gene_oncoprint, group_annotation)
gene_oncoprint_main.to_csv(snakemake.output.gene_oncoprint, sep="\t", index=False)

os.makedirs(snakemake.output.gene_oncoprint_sortings)

# one sorted oncoprint TSV per label
sort_oncoprint_labels(gene_oncoprint)


# one per-variant oncoprint TSV per gene symbol
os.makedirs(snakemake.output.variant_oncoprints)
for gene, gene_calls in calls.groupby("symbol"):
    variant_oncoprint(gene_calls, group_annotation).to_csv(
        Path(snakemake.output.variant_oncoprints) / f"{gene}.tsv", sep="\t", index=False
    )
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import sys

sys.stderr = open(snakemake.log[0], "w")

import json
import re

import numpy as np
import pandas as pd


PROB_EPSILON = 0.01  # columns with all probabilities below will be dropped


def write(df, path):
    """Write *df* to *path* as TSV, without helper and all-NA columns."""
    # drop columns used only for upstream transcript filtering
    for helper_col in ("mane_plus_clinical", "canonical"):
        df = df.drop([helper_col], axis="columns", errors="ignore")
    if not df.empty:
        keep = df.dropna(how="all", axis="columns").columns.tolist()
        if path == snakemake.output.coding:
            # ensure that these columns are kept, even if they contain only NAs in a coding setting
            keep += ["revel", "hgvsp", "symbol"]
            keep = [col for col in df.columns if col in keep]
        df = df[keep]
    df.to_csv(path, index=False, sep="\t")


def format_floats(df):
    """Render float columns for display (in place) and return *df*.

    Values strictly between 0 and 0.1 become scientific notation with two
    decimals; every other float is rounded to two decimals.
    """

    def _render(value):
        if 0 < value < 0.1:
            return "{:.2e}".format(value)
        return round(value, 2)

    for column in df.columns:
        if issubclass(df[column].dtype.type, np.floating):
            df[column] = [_render(value) for value in df[column]]
    return df


def drop_cols_by_predicate(df, columns, predicate):
    """Drop each column in *columns* whose float-cast values all satisfy *predicate*."""
    to_drop = []
    for col in columns:
        if predicate(df[col].astype(float)).all():
            to_drop.append(col)
    return df.drop(labels=to_drop, axis="columns")


def drop_low_prob_cols(df):
    """Drop "prob: " columns whose probabilities never exceed PROB_EPSILON."""
    prob_cols = df.columns[df.columns.str.startswith("prob: ")]
    return drop_cols_by_predicate(
        df, prob_cols, lambda probs: probs <= PROB_EPSILON
    )


def get_vaf_columns(df):
    """Return the columns holding per-sample allele frequencies."""
    is_vaf = df.columns.str.endswith(": allele frequency")
    return df.columns[is_vaf]


def get_samples(df):
    """Return sample names derived from the allele-frequency column headers."""
    suffix = ": allele frequency"
    vaf_cols = df.columns[df.columns.str.endswith(suffix)]
    return list(vaf_cols.str.replace(suffix, ""))


def get_vartype(rec):
    ref_allele = rec["reference allele"]
    alt_allele = rec["alternative allele"]
    if alt_allele == "<DEL>" or (len(ref_allele) > 1 and len(alt_allele) == 1):
        return "deletion"
    elif alt_allele == "<INS>" or (len(alt_allele) > 1 and len(ref_allele) == 1):
        return "insertion"
    elif alt_allele == "<INV>":
        return "inversion"
    elif alt_allele == "<DUP>":
        return "duplication"
    elif alt_allele == "<TDUP>":
        return "tandem duplication"
    elif alt_allele == "<CNV>":
        return "copy number variation"
    elif alt_allele.startswith("<"):
        # catch other special alleles
        return "complex"
    elif len(alt_allele) == 1 and len(ref_allele) == 1:
        return "snv"
    elif len(alt_allele) == len(ref_allele):
        return "mnv"
    elif len(alt_allele) > 1 and len(ref_allele) > 1:
        return "replacement"
    else:
        return "breakend"


def order_impact(df):
    """Turn the "impact" column into a categorical ordered by severity (in place)."""
    # renamed local list: it previously shadowed the function's own name
    impact_levels = ["MODIFIER", "LOW", "MODERATE", "HIGH"]
    df["impact"] = pd.Categorical(df["impact"], impact_levels)


def sort_calls(df):
    # Sort calls in place, descending, by the columns configured for the
    # workflow (snakemake.params.sorting), so the most relevant calls come first.
    df.sort_values(snakemake.params.sorting, ascending=False, inplace=True)


def reorder_prob_cols(df):
    """Group all "prob: " columns together, ordered by descending column sum.

    The probability block is placed at the leftmost position any probability
    column currently occupies; all other columns keep their relative order.
    """
    prob_df = df.filter(regex="^prob: ")
    if prob_df.empty:
        return df
    by_total = prob_df.sum().sort_values(ascending=False).keys()
    insert_at = min(df.columns.get_loc(col) for col in by_total)
    new_order = df.columns.drop(labels=by_total)
    for offset, col in enumerate(by_total):
        new_order = new_order.insert(insert_at + offset, col)
    return df.reindex(columns=new_order)


def reorder_vaf_cols(df):
    """Move the allele-frequency columns directly in front of "consequence"."""
    pivot = df.columns.get_loc("consequence")
    vaf_cols = df.filter(regex=": allele frequency$").columns
    before = df.columns[:pivot]
    after = df.columns[pivot:].drop(vaf_cols)
    return df[before.append([vaf_cols, after])]


def cleanup_dataframe(df):
    # Final presentation pipeline: drop uninformative probability columns,
    # group the probability columns, move VAF columns next to "consequence",
    # and format floats for display. Order matters: formatting floats turns
    # some values into strings, so it must come last.
    df = drop_low_prob_cols(df)
    df = reorder_prob_cols(df)
    df = reorder_vaf_cols(df)
    df = format_floats(df)
    return df


def join_short_obs(df, samples):
    """Merge per-sample short ref/alt observation columns into one.

    For each sample, a "<sample>: short observations" column of the form
    "<ref>,<alt>" is inserted directly before "<sample>: observations"; the
    two source columns are dropped from the returned frame.
    """
    for sample in samples:
        combined = (
            df[f"{sample}: short ref observations"]
            + ","
            + df[f"{sample}: short alt observations"]
        )
        position = df.columns.get_loc(f"{sample}: observations")
        df.insert(position, f"{sample}: short observations", combined)
    obsolete = df.columns[
        df.columns.str.endswith(": short ref observations")
        | df.columns.str.endswith(": short alt observations")
    ]
    return df.drop(obsolete, axis=1)


def bin_max_vaf(df, samples):
    af_columns = [f"{sample}: allele frequency" for sample in samples]
    max_vaf = df[af_columns].apply("max", axis=1)
    df["binned max vaf"] = pd.cut(
        max_vaf, [0, 0.33, 0.66, 1.0], labels=["low", "medium", "high"]
    )
    df["binned max vaf"] = pd.Categorical(
        df["binned max vaf"], ["low", "medium", "high"]
    )
    return df


# Load the annotated calls table, derive display columns, and split the calls
# into a coding and a noncoding output table.
calls = pd.read_csv(snakemake.input[0], sep="\t")
# NOTE(review): eval() parses stringified Python lists from the TSV; this is
# only acceptable because the input is generated by the workflow itself, never
# by untrusted users
calls["clinical significance"] = (
    calls["clinical significance"]
    .apply(eval)
    .apply(sorted)
    .apply(", ".join)
    .replace("", np.nan)
)

calls["protein alteration (short)"] = (
    calls["protein alteration (short)"].apply(eval).apply("/".join).replace("", np.nan)
)

samples = get_samples(calls)

# bin the per-call maximum VAF only if allele-frequency columns are present
if calls.columns.str.endswith(": allele frequency").any():
    calls = bin_max_vaf(calls, samples)

if not calls.empty:
    # these below only work on non empty dataframes
    calls["vartype"] = calls.apply(get_vartype, axis="columns")
    order_impact(calls)
    sort_calls(calls)
else:
    calls["vartype"] = []


calls.set_index("gene", inplace=True, drop=False)

# merge per-sample short ref/alt observation columns when present
if calls.columns.str.endswith(": short ref observations").any():
    calls = join_short_obs(calls, samples)

# a call counts as coding if it carries a protein-level HGVS notation
coding = ~pd.isna(calls["hgvsp"])
canonical = calls["canonical"].notnull()
mane_plus_clinical = calls["mane_plus_clinical"].notnull()
# keep only canonical or MANE-plus-clinical transcripts
canonical_mane = canonical | mane_plus_clinical

noncoding_calls = calls[~coding & canonical_mane]
noncoding_calls = cleanup_dataframe(noncoding_calls)
write(noncoding_calls, snakemake.output.noncoding)

coding_calls = calls[coding & canonical_mane].drop(
    [
        "end position",
        "event",
        "id",
    ],
    axis="columns",
)
coding_calls = cleanup_dataframe(coding_calls)

# coding variants
# Here we have all variant info in hgvsp,
# hence we can drop the other variant specific columns.
write(
    coding_calls,
    snakemake.output.coding,
)
1
2
3
4
5
6
7
import sys
import pandas as pd

# redirect stderr to the snakemake log file
sys.stderr = open(snakemake.log[0], "w")

# convert the input TSV table into an Excel workbook
data = pd.read_csv(snakemake.input.tsv, sep="\t")
data.to_excel(snakemake.output.xlsx, index=False)
ShowHide 104 more snippets with no or duplicated tags.

Login to post a comment if you would like to share your experience with this workflow.

Do you know this workflow well? If so, you can request seller status , and start supporting this workflow.

Free

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/snakemake-workflows/dna-seq-varlociraptor
Name: dna-seq-varlociraptor
Version: v5.0.1
Badge:
workflow icon

Insert copied code into your website to add a link to this workflow.

Other Versions:
Downloaded: 0
Copyright: Public Domain
License: MIT License
  • Future updates

Related Workflows

cellranger-snakemake-gke
snakemake workflow to run cellranger on a given bucket using gke.
A Snakemake workflow for running cellranger on a given bucket using Google Kubernetes Engine. The usage of this workflow ...