Neoantigen Prediction Snakemake Workflow: Genomic Variant Detection and Peptidome Incorporation


This workflow detects genomic variants with Strelka and incorporates germline and somatic variants into a sample-specific peptidome using Microphaser. Somatic neopeptides are filtered against germline peptides by sequence similarity and MHC affinity. Affinity prediction is performed with netMHCpan and netMHCIIpan. The workflow allows easy definition of tumor-normal pairs, where multiple tumor samples (e.g. metastases) can be grouped with the same normal sample.

Software requirements

Unfortunately, the use of netMHCpan and netMHCIIpan is restricted to academic use. To use these tools in the pipeline, please download them separately, following the instructions at https://services.healthtech.dtu.dk/software.php, and specify the path to your netMHC folder in the config.yaml.
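
For orientation, here is a minimal sketch of how a rule can pick that path up from the config. The key name netMHC, the file paths, and the simplified command are illustrative assumptions, not the workflow's exact schema (the actual netMHC rules further down also pass the HLA allele list via -a):

configfile: "config.yaml"

# Hypothetical rule; assumes config.yaml contains e.g. `netMHC: /opt/netMHCpan-4.0`.
rule netMHCpan:
    input:
        peptides="results/microphaser/{sample}.pep.fa",  # assumed path
    output:
        "results/netMHCpan/{sample}.xls",
    log:
        "logs/netMHCpan/{sample}.log",
    params:
        netMHC=config["netMHC"],  # path to the downloaded netMHC folder
        extra="",
    shell:
        "{params.netMHC}/netMHCpan {params.extra} -xlsfile {output} "
        "-f {input.peptides} > {log}"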

Code Snippets

__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell(
    "picard MarkDuplicates {snakemake.params} INPUT={snakemake.input} "
    "OUTPUT={snakemake.output.bam} METRICS_FILE={snakemake.output.metrics} "
    "&> {snakemake.log}"
)
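
A rule invoking this wrapper could look as follows; the paths and the REMOVE_DUPLICATES option are placeholders, and params is passed through to picard verbatim:

rule mark_duplicates:
    input:
        "mapped/{sample}.bam"
    output:
        bam="dedup/{sample}.bam",
        metrics="dedup/{sample}.metrics.txt"
    log:
        "logs/picard/dedup/{sample}.log"
    params:
        "REMOVE_DUPLICATES=true"
    wrapper:
        "0.39.0/bio/picard/markduplicates"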
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

fq1 = snakemake.input.get("fq1")
assert fq1 is not None, "input-> fq1 is a required input parameter"
fq1 = (
    [snakemake.input.fq1]
    if isinstance(snakemake.input.fq1, str)
    else snakemake.input.fq1
)
fq2 = snakemake.input.get("fq2")
if fq2:
    fq2 = (
        [snakemake.input.fq2]
        if isinstance(snakemake.input.fq2, str)
        else snakemake.input.fq2
    )
    assert len(fq1) == len(
        fq2
    ), "input-> equal number of files required for fq1 and fq2"
input_str_fq1 = ",".join(fq1)
input_str_fq2 = ",".join(fq2) if fq2 is not None else ""
input_str = " ".join([input_str_fq1, input_str_fq2])

if fq1[0].endswith(".gz"):
    readcmd = "--readFilesCommand zcat"
else:
    readcmd = ""

outprefix = os.path.dirname(snakemake.output[0]) + "/"

shell(
    "STAR "
    "{extra} "
    "--runThreadN {snakemake.threads} "
    "--genomeDir {snakemake.params.index} "
    "--readFilesIn {input_str} "
    "{readcmd} "
    "--outFileNamePrefix {outprefix} "
    "--outStd Log "
    "{log}"
)
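
A sketch of a calling rule, with illustrative paths: gzipped FASTQs are detected via the .gz suffix and streamed through zcat, and the output prefix is derived from the output file's directory:

rule star_align:
    input:
        fq1="trimmed/{sample}.1.fastq.gz",
        fq2="trimmed/{sample}.2.fastq.gz"
    output:
        "star/{sample}/Aligned.sortedByCoord.out.bam"
    log:
        "logs/star/{sample}.log"
    params:
        index="resources/star_index",
        extra="--outSAMtype BAM SortedByCoordinate"
    threads: 8
    wrapper:
        "0.42.0/bio/star/align"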
__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2019, Dayris Thibault"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

extra = snakemake.params.get("extra", "")
sjdb_overhang = snakemake.params.get("sjdbOverhang", "100")

gtf = snakemake.input.get("gtf")
if gtf is not None:
    gtf = "--sjdbGTFfile " + gtf
    sjdb_overhang = "--sjdbOverhang " + sjdb_overhang
else:
    gtf = sjdb_overhang = ""

makedirs(snakemake.output)

shell(
    "STAR "  # Tool
    "--runMode genomeGenerate "  # Indexation mode
    "{extra} "  # Optional parameters
    "--runThreadN {snakemake.threads} "  # Number of threads
    "--genomeDir {snakemake.output} "  # Path to output
    "--genomeFastaFiles {snakemake.input.fasta} "  # Path to fasta files
    "{sjdb_overhang} "  # Read-len - 1
    "{gtf} "  # Highly recommended GTF
    "{log}"  # Logging
)
__author__ = "Patrik Smeds"
__copyright__ = "Copyright 2016, Patrik Smeds"
__email__ = "[email protected]"
__license__ = "MIT"

from os import path

from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if len(snakemake.input) == 0:
    raise ValueError("A reference genome has to be provided!")
elif len(snakemake.input) > 1:
    raise ValueError("Only one reference genome can be inputed!")

# Prefix that should be used for the database
prefix = snakemake.params.get("prefix", "")

if len(prefix) > 0:
    prefix = "-p " + prefix

# Construction algorithm that will be used to build the database, default is bwtsw
construction_algorithm = snakemake.params.get("algorithm", "")

if len(construction_algorithm) != 0:
    construction_algorithm = "-a " + construction_algorithm

shell(
    "bwa index" " {prefix}" " {construction_algorithm}" " {snakemake.input[0]}" " {log}"
)
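
A sketch of a calling rule with assumed paths; without a prefix parameter, bwa writes the index files next to the input FASTA:

rule bwa_index:
    input:
        "resources/genome.fasta"
    output:
        multiext("resources/genome.fasta", ".amb", ".ann", ".bwt", ".pac", ".sa")
    log:
        "logs/bwa_index.log"
    params:
        algorithm="bwtsw"
    wrapper:
        "0.45.1/bio/bwa/index"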
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2018, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell(
    "picard "
    "CreateSequenceDictionary "
    "{extra} "
    "R={snakemake.input[0]} "
    "O={snakemake.output[0]} "
    "{log}"
)
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

species = snakemake.params.species.lower()
release = snakemake.params.release
fmt = snakemake.params.fmt
build = snakemake.params.build

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffix = ""
if fmt == "gtf":
    suffix = "gtf.gz"
elif fmt == "gff3":
    suffix = "gff3.gz"

url = "ftp://ftp.ensembl.org/pub/release-{release}/{fmt}/{species}/{species_cap}.{build}.{release}.{suffix}".format(
    release=release,
    build=build,
    species=species,
    fmt=fmt,
    species_cap=species.capitalize(),
    suffix=suffix,
)

shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import subprocess as sp
from snakemake.shell import shell

species = snakemake.params.species.lower()
release = snakemake.params.release
build = snakemake.params.build

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

suffixes = ""
datatype = snakemake.params.get("datatype", "")
if datatype == "dna":
    suffixes = ["dna.primary_assembly.fa.gz", "dna.toplevel.fa.gz"]
elif datatype == "cdna":
    suffixes = ["cdna.all.fa.gz"]
elif datatype == "cds":
    suffixes = ["cds.all.fa.gz"]
elif datatype == "ncrna":
    suffixes = ["ncrna.fa.gz"]
elif datatype == "pep":
    suffixes = ["pep.all.fa.gz"]
else:
    raise ValueError("invalid datatype, must be one of dna, cdna, cds, ncrna, pep")

success = False
for suffix in suffixes:
    url = "ftp://ftp.ensembl.org/pub/release-{release}/fasta/{species}/{datatype}/{species_cap}.{build}.{suffix}".format(
        release=release,
        species=species,
        datatype=datatype,
        build=build,
        suffix=suffix,
        species_cap=species.capitalize(),
    )

    try:
        shell("curl -sSf {url} > /dev/null 2> /dev/null")
    except sp.CalledProcessError:
        continue

    shell("(curl -L {url} | gzip -d > {snakemake.output[0]}) {log}")
    success = True
    break

if not success:
    raise ValueError(
        "Requested sequence does not seem to exist on ensembl FTP servers or servers are unavailable (url {})".format(
            url
        )
    )
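
A sketch of a download rule using this script via its wrapper; the species, build, and release values are examples:

rule get_genome:
    output:
        "resources/genome.fasta"
    log:
        "logs/get-genome.log"
    params:
        species="homo_sapiens",
        datatype="dna",
        build="GRCh38",
        release="98"
    wrapper:
        "0.45.1/bio/reference/ensembl-sequence"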
__author__ = "Michael Chambers"
__copyright__ = "Copyright 2019, Michael Chambers"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell("samtools faidx {snakemake.params} {snakemake.input[0]} > {snakemake.output[0]}")
__author__ = "Johannes Köster, Julian de Ruiter"
__copyright__ = "Copyright 2016, Johannes Köster and Julian de Ruiter"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


from os import path

from snakemake.shell import shell


# Extract arguments.
extra = snakemake.params.get("extra", "")

sort = snakemake.params.get("sort", "none")
sort_order = snakemake.params.get("sort_order", "coordinate")
sort_extra = snakemake.params.get("sort_extra", "")

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

# Check inputs/arguments.
if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in {
    1,
    2,
}:
    raise ValueError("input must have 1 (single-end) or " "2 (paired-end) elements")

if sort_order not in {"coordinate", "queryname"}:
    raise ValueError("Unexpected value for sort_order ({})".format(sort_order))

# Determine which pipe command to use for converting to bam or sorting.
if sort == "none":

    # Simply convert to bam using samtools view.
    pipe_cmd = "samtools view -Sbh -o {snakemake.output[0]} -"

elif sort == "samtools":

    # Sort alignments using samtools sort.
    pipe_cmd = "samtools sort {sort_extra} -o {snakemake.output[0]} -"

    # Add name flag if needed.
    if sort_order == "queryname":
        sort_extra += " -n"

    prefix = path.splitext(snakemake.output[0])[0]
    sort_extra += " -T " + prefix + ".tmp"

elif sort == "picard":

    # Sort alignments using picard SortSam.
    pipe_cmd = (
        "picard SortSam {sort_extra} INPUT=/dev/stdin"
        " OUTPUT={snakemake.output[0]} SORT_ORDER={sort_order}"
    )

else:
    raise ValueError("Unexpected value for params.sort ({})".format(sort))

shell(
    "(bwa mem"
    " -t {snakemake.threads}"
    " {extra}"
    " {snakemake.params.index}"
    " {snakemake.input.reads}"
    " | " + pipe_cmd + ") {log}"
)
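
A sketch of a calling rule with assumed paths; params.index is the prefix of a BWA index built from the reference, and the read-group string is illustrative:

rule map_reads:
    input:
        reads=["trimmed/{sample}.1.fastq.gz", "trimmed/{sample}.2.fastq.gz"]
    output:
        "mapped/{sample}.bam"
    log:
        "logs/bwa_mem/{sample}.log"
    params:
        index="resources/genome.fasta",
        extra=r"-R '@RG\tID:{sample}\tSM:{sample}'",
        sort="samtools",
        sort_order="coordinate"
    threads: 8
    wrapper:
        "0.56.0/bio/bwa/mem"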
__author__ = "Johannes Köster, Derek Croote"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import os
import tempfile
from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

outdir = os.path.dirname(snakemake.output[0])
if outdir:
    outdir = "--outdir {}".format(outdir)

extra = snakemake.params.get("extra", "")

with tempfile.TemporaryDirectory() as tmp:
    shell(
        "fasterq-dump --temp {tmp} --threads {snakemake.threads} "
        "{extra} {outdir} {snakemake.wildcards.accession} {log}"
    )
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell(
    "bcftools concat {snakemake.params} -o {snakemake.output[0]} "
    "{snakemake.input.calls}"
)
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


n = len(snakemake.input)
assert n == 2, "Input must contain 2 (paired-end) elements."

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell(
    "cutadapt"
    " {snakemake.params.adapters}"
    " {snakemake.params.others}"
    " -o {snakemake.output.fastq1}"
    " -p {snakemake.output.fastq2}"
    " -j {snakemake.threads}"
    " {snakemake.input}"
    " > {snakemake.output.qc} {log}"
)
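
An example invocation with placeholder adapter sequences and paths; note that the wrapper asserts exactly two input files:

rule cutadapt_pe:
    input:
        ["reads/{sample}.1.fastq.gz", "reads/{sample}.2.fastq.gz"]
    output:
        fastq1="trimmed/{sample}.1.fastq.gz",
        fastq2="trimmed/{sample}.2.fastq.gz",
        qc="trimmed/{sample}.qc.txt"
    log:
        "logs/cutadapt/{sample}.log"
    params:
        adapters="-a AGATCGGAAGAGC -A AGATCGGAAGAGC",  # illustrative adapters
        others="--minimum-length 33 -q 20"             # illustrative filters
    threads: 8
    wrapper:
        "0.59.2/bio/cutadapt/pe"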
__author__ = "Julian de Ruiter"
__copyright__ = "Copyright 2017, Julian de Ruiter"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell(
    "cutadapt"
    " {snakemake.params}"
    " -j {snakemake.threads}"
    " -o {snakemake.output.fastq}"
    " {snakemake.input[0]}"
    " > {snakemake.output.qc} {log}"
)
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2019, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import tempfile
import subprocess
import sys
import os
from snakemake.shell import shell
from snakemake.exceptions import WorkflowError

species = snakemake.params.species.lower()
release = int(snakemake.params.release)
build = snakemake.params.build
type = snakemake.params.type

if release < 98:
    print("Ensembl releases <98 are unsupported.", file=open(snakemake.log[0], "w"))
    exit(1)

branch = ""
if release >= 81 and build == "GRCh37":
    # use the special grch37 branch for new releases
    branch = "grch37/"

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

if type == "all":
    if species == "homo_sapiens" and release >= 93:
        suffixes = [
            "-chr{}".format(chrom) for chrom in list(range(1, 23)) + ["X", "Y", "MT"]
        ]
    else:
        suffixes = [""]
elif type == "somatic":
    suffixes = ["_somatic"]
elif type == "structural_variations":
    suffixes = ["_structural_variations"]
else:
    raise ValueError(
        "Unsupported type {} (only all, somatic, structural_variations are allowed)".format(
            type
        )
    )

species_filename = species if release >= 91 else species.capitalize()

urls = [
    "ftp://ftp.ensembl.org/pub/{branch}release-{release}/variation/vcf/{species}/{species_filename}{suffix}.{ext}".format(
        release=release,
        species=species,
        suffix=suffix,
        species_filename=species_filename,
        branch=branch,
        ext=ext,
    )
    for suffix in suffixes
    for ext in ["vcf.gz", "vcf.gz.csi"]
]
names = [os.path.basename(url) for url in urls if url.endswith(".gz")]

try:
    gather = "curl {urls}".format(urls=" ".join(map("-O {}".format, urls)))
    workdir = os.getcwd()
    with tempfile.TemporaryDirectory() as tmpdir:
        if snakemake.input.get("fai"):
            shell(
                "(cd {tmpdir}; {gather} && "
                "bcftools concat -Oz --naive {names} > concat.vcf.gz && "
                "bcftools reheader --fai {workdir}/{snakemake.input.fai} concat.vcf.gz "
                "> {workdir}/{snakemake.output}) {log}"
            )
        else:
            shell(
                "(cd {tmpdir}; {gather} && "
                "bcftools concat -Oz --naive {names} "
                "> {workdir}/{snakemake.output}) {log}"
            )
except subprocess.CalledProcessError as e:
    if snakemake.log:
        sys.stderr = open(snakemake.log[0], "a")
    print(
        "Unable to download variation data from Ensembl. "
        "Did you check that this combination of species, build, and release is actually provided? ",
        file=sys.stderr,
    )
    exit(1)
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell("samtools index {snakemake.params} {snakemake.input[0]} {snakemake.output[0]}")
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell("tabix {snakemake.params} {snakemake.input[0]} {log}")
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from pathlib import Path
from snakemake.shell import shell


def get_only_child_dir(path):
    children = [child for child in path.iterdir() if child.is_dir()]
    assert (
        len(children) == 1
    ), "Invalid VEP cache directory, only a single entry is allowed, make sure that cache was created with the snakemake VEP cache wrapper"
    return children[0]


extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=False, stderr=True)

fork = "--fork {}".format(snakemake.threads) if snakemake.threads > 1 else ""
stats = snakemake.output.stats
cache = snakemake.input.cache
plugins = snakemake.input.plugins

entrypath = get_only_child_dir(get_only_child_dir(Path(cache)))
species = entrypath.parent.name
release, build = entrypath.name.split("_")

load_plugins = " ".join(map("--plugin {}".format, snakemake.params.plugins))

if snakemake.output.calls.endswith(".vcf.gz"):
    fmt = "z"
elif snakemake.output.calls.endswith(".bcf"):
    fmt = "b"
else:
    fmt = "v"

shell(
    "(bcftools view {snakemake.input.calls} | "
    "vep {extra} {fork} "
    "--format vcf "
    "--vcf "
    "--cache "
    "--cache_version {release} "
    "--species {species} "
    "--assembly {build} "
    "--dir_cache {cache} "
    "--dir_plugins {plugins} "
    "--offline "
    "{load_plugins} "
    "--output_file STDOUT "
    "--stats_file {stats} | "
    "bcftools view -O{fmt} > {snakemake.output.calls}) {log}"
)
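
A sketch of a calling rule; it assumes the cache and plugins directories were created by the VEP cache/plugins wrappers that follow, since this script derives species, release, and build from the cache layout. The plugin choice is illustrative:

rule annotate_variants:
    input:
        calls="results/calls/{pair}.bcf",
        cache="resources/vep/cache",
        plugins="resources/vep/plugins"
    output:
        calls="results/calls/{pair}.annotated.bcf",
        stats="results/calls/{pair}.stats.html"
    log:
        "logs/vep/{pair}.log"
    params:
        plugins=["LoFtool"],
        extra=""
    threads: 4
    wrapper:
        "0.59.2/bio/vep/annotate"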
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

from pathlib import Path
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

shell(
    "vep_install --AUTO cf "
    "--SPECIES {snakemake.params.species} "
    "--ASSEMBLY {snakemake.params.build} "
    "--CACHE_VERSION {snakemake.params.release} "
    "--CACHEDIR {snakemake.output} "
    "--CONVERT "
    "--NO_UPDATE {log}"
)
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2020, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"

import sys
from pathlib import Path
from urllib.request import urlretrieve
from zipfile import ZipFile
from tempfile import NamedTemporaryFile

if snakemake.log:
    sys.stderr = open(snakemake.log[0], "w")

outdir = Path(snakemake.output[0])
outdir.mkdir()

with NamedTemporaryFile() as tmp:
    urlretrieve(
        "https://github.com/Ensembl/VEP_plugins/archive/release/{release}.zip".format(
            release=snakemake.params.release
        ),
        tmp.name,
    )

    with ZipFile(tmp.name) as f:
        for member in f.infolist():
            memberpath = Path(member.filename)
            if len(memberpath.parts) == 1:
                # skip root dir
                continue
            targetpath = outdir / memberpath.relative_to(memberpath.parts[0])
            if member.is_dir():
                targetpath.mkdir()
            else:
                with open(targetpath, "wb") as out:
                    out.write(f.read(member.filename))
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

## Extract arguments
extra = snakemake.params.get("extra", "")

shell("bcftools index" " {extra}" " {snakemake.input[0]}")
__author__ = "Jan Forster"
__copyright__ = "Copyright 2020, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

## Extract arguments
header = snakemake.input.get("header", "")
if header:
    header_cmd = "-h " + header
else:
    header_cmd = ""

samples = snakemake.input.get("samples", "")
if samples:
    samples_cmd = "-s " + samples
else:
    samples_cmd = ""

extra = snakemake.params.get("extra", "")
view_extra = snakemake.params.get("view_extra", "")

shell(
    "bcftools reheader "
    "{extra} "
    "{header_cmd} "
    "{samples_cmd} "
    "{snakemake.input.vcf} "
    "| bcftools view "
    "{view_extra} "
    "> {snakemake.output}"
)
__author__ = "Johannes Köster"
__copyright__ = "Copyright 2016, Johannes Köster"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell(
    "bcftools view {snakemake.params} {snakemake.input[0]} " "-o {snakemake.output[0]}"
)
__author__ = "Jan Forster"
__copyright__ = "Copyright 2019, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"

from os import path

from snakemake.shell import shell

## Extract arguments
extra = snakemake.params.get("extra", "")

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

shell(
    "(pre.py"
    " --threads {snakemake.threads}"
    " -r {snakemake.params.genome}"
    " {extra}"
    " {snakemake.input.variants}"
    " {snakemake.output})"
    " {log}"
)
__author__ = "Jan Forster"
__copyright__ = "Copyright 2019, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

discarded_fusions = snakemake.output.get("discarded", "")
if discarded_fusions:
    discarded_cmd = "-O " + discarded_fusions
else:
    discarded_cmd = ""

blacklist = snakemake.params.get("blacklist")
if blacklist:
    blacklist_cmd = "-b " + blacklist
else:
    blacklist_cmd = ""

known_fusions = snakemake.params.get("known_fusions")
if known_fusions:
    known_cmd = "-k" + known_fusions
else:
    known_cmd = ""

sv_file = snakemake.params.get("sv_file")
if sv_file:
    sv_cmd = "-d" + sv_file
else:
    sv_cmd = ""

shell(
    "arriba "
    "-x {snakemake.input.bam} "
    "-a {snakemake.input.genome} "
    "-g {snakemake.input.annotation} "
    "{blacklist_cmd} "
    "{known_cmd} "
    "{sv_cmd} "
    "-o {snakemake.output.fusions} "
    "{discarded_cmd} "
    "{extra} "
    "{log}"
)
__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allowing for multiple FASTA files
fasta = snakemake.input.get("fasta")
assert fasta is not None, "input-> a FASTA-file is required"
fasta = " ".join(fasta) if isinstance(fasta, list) else fasta

shell(
    "kallisto index "  # Tool
    "{extra} "  # Optional parameters
    "--index={snakemake.output.index} "  # Output file
    "{fasta} "  # Input FASTA files
    "{log}"  # Logging
)
__author__ = "Joël Simoneau"
__copyright__ = "Copyright 2019, Joël Simoneau"
__email__ = "[email protected]"
__license__ = "MIT"

from snakemake.shell import shell

# Creating log
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

# Placeholder for optional parameters
extra = snakemake.params.get("extra", "")

# Allowing for multiple FASTQ files
fastq = snakemake.input.get("fastq")
assert fastq is not None, "input-> a FASTQ-file is required"
fastq = " ".join(fastq) if isinstance(fastq, list) else fastq

shell(
    "kallisto quant "  # Tool
    "{extra} "  # Optional parameters
    "--threads={snakemake.threads} "  # Number of threads
    "--index={snakemake.input.index} "  # Input file
    "--output-dir={snakemake.output} "  # Output directory
    "{fastq} "  # Input FASTQ files
    "{log}"  # Logging
)
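
An example invocation with assumed paths; the wrapper passes the whole output as --output-dir, so the output should be declared as a directory:

rule kallisto_quant:
    input:
        fastq=["trimmed/{sample}.1.fastq.gz", "trimmed/{sample}.2.fastq.gz"],
        index="resources/kallisto/transcripts.idx"
    output:
        directory("results/kallisto/{sample}")
    log:
        "logs/kallisto/{sample}.log"
    params:
        extra="-b 100"  # illustrative: 100 bootstrap samples
    threads: 4
    wrapper:
        "0.60.1/bio/kallisto/quant"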
__author__ = "Jan Forster"
__copyright__ = "Copyright 2020, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)


shell(
    "(razers3"
    " -tc {snakemake.threads}"
    " {extra}"
    " -o {snakemake.output[0]}"
    " {snakemake.params.genome}"
    " {snakemake.input.reads})"
    " {log}"
)
__author__ = "David Laehnemann, Victoria Sack"
__copyright__ = "Copyright 2018, David Laehnemann, Victoria Sack"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell


prefix = os.path.splitext(snakemake.output[0])[0]

shell(
    "samtools bam2fq {snakemake.params} "
    " -@ {snakemake.threads} "
    " {snakemake.input[0]}"
    " >{snakemake.output[0]} "
)
__author__ = "Christopher Schröder"
__copyright__ = "Copyright 2020, Christopher Schröder"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
java_opts = snakemake.params.get("java_opts", "")

log = snakemake.log_fmt_shell(stdout=True, stderr=True, append=True)
shell(
    "gatk --java-options '{java_opts}' ApplyBQSR {extra} -R {snakemake.input.ref} -I {snakemake.input.bam} "
    "--bqsr-recal-file {snakemake.input.recal_table} "
    "-O {snakemake.output.bam} {log}"
)
__author__ = "Christopher Schröder"
__copyright__ = "Copyright 2020, Christopher Schröder"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
spark_runner = snakemake.params.get("spark_runner", "LOCAL")
spark_master = snakemake.params.get(
    "spark_master", "local[{}]".format(snakemake.threads)
)
spark_extra = snakemake.params.get("spark_extra", "")
java_opts = snakemake.params.get("java_opts", "")

log = snakemake.log_fmt_shell(stdout=True, stderr=True)
known = snakemake.input.get("known", "")
if known:
    known = "--known-sites {}".format(known)

shell(
    "gatk --java-options '{java_opts}' BaseRecalibratorSpark {extra} "
    "-R {snakemake.input.ref} -I {snakemake.input.bam} "
    "-O {snakemake.output.recal_table} {known} "
    "-- --spark-runner {spark_runner} --spark-master {spark_master} {spark_extra} "
    "{log}"
)
__author__ = "Jan Forster"
__copyright__ = "Copyright 2020, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from snakemake.shell import shell

extra = snakemake.params.get("extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)
outdir = os.path.dirname(snakemake.output[0])

# get sequencing type
seq_type = snakemake.params.get("sequencing_type", "dna")
seq_type = "--{}".format(seq_type)

# check if non-default config.ini is used
config = snakemake.params.get("config", "")
if any(config):
    config = "--config {}".format(config)

shell(
    "(OptiTypePipeline.py"
    " --input {snakemake.input.reads}"
    " --outdir {outdir}"
    " --prefix {snakemake.wildcards.sample}"
    " {seq_type}"
    " {config}"
    " {extra})"
    " {log}"
)
__author__ = "Dayne Filer"
__copyright__ = "Copyright 2019, Dayne Filer"
__email__ = "[email protected]"
__license__ = "MIT"


from snakemake.shell import shell


shell(
    "bcftools norm {snakemake.params} {snakemake.input[0]} " "-o {snakemake.output[0]}"
)
__author__ = "Johannes Köster, Felix Mölder"
__copyright__ = "Copyright 2017, Johannes Köster"
__email__ = "[email protected], [email protected]"
__license__ = "MIT"


from snakemake.shell import shell

shell.executable("bash")

log = snakemake.log_fmt_shell(stdout=False, stderr=True)

params = snakemake.params.get("extra", "")

pipe = ""
if snakemake.output[0].endswith(".bcf"):
    pipe = "| bcftools view -Ob -"

if snakemake.threads == 1:
    freebayes = "freebayes"
else:
    chunksize = snakemake.params.get("chunksize", 100000)
    regions = "<(fasta_generate_regions.py {snakemake.input.ref}.fai {chunksize})".format(
        snakemake=snakemake, chunksize=chunksize
    )
    if snakemake.input.get("regions", ""):
        regions = (
            "<(bedtools intersect -a "
            r"<(sed 's/:\([0-9]*\)-\([0-9]*\)$/\t\1\t\2/' "
            "{regions}) -b {snakemake.input.regions} | "
            r"sed 's/\t\([0-9]*\)\t\([0-9]*\)$/:\1-\2/')"
        ).format(regions=regions, snakemake=snakemake)
    freebayes = ("freebayes-parallel {regions} {snakemake.threads}").format(
        snakemake=snakemake, regions=regions
    )

shell(
    "({freebayes} {params} -f {snakemake.input.ref}"
    " {snakemake.input.samples} {pipe} > {snakemake.output[0]}) {log}"
)
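
An example rule with placeholder paths; with more than one thread the wrapper switches to freebayes-parallel, which requires a .fai index alongside the reference:

rule freebayes:
    input:
        ref="resources/genome.fasta",        # resources/genome.fasta.fai assumed present
        samples="recal/{sample}.sorted.bam"  # indexed BAM assumed
    output:
        "candidate-calls/{sample}.vcf"
    log:
        "logs/freebayes/{sample}.log"
    params:
        extra="--min-alternate-fraction 0.05",  # illustrative
        chunksize=100000
    threads: 8
    wrapper:
        "0.65.0/bio/freebayes"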
__author__ = "Jan Forster"
__copyright__ = "Copyright 2019, Jan Forster"
__email__ = "[email protected]"
__license__ = "MIT"


import os
from pathlib import Path
from snakemake.shell import shell

config_extra = snakemake.params.get("config_extra", "")
run_extra = snakemake.params.get("run_extra", "")
log = snakemake.log_fmt_shell(stdout=True, stderr=True)

bam = snakemake.input.get("bam")  # input bam file, required
assert bam is not None, "input-> bam is a required input parameter"

if snakemake.output[0].endswith(".vcf.gz"):
    run_dir = Path(snakemake.output[0]).parents[2]
else:
    run_dir = snakemake.output

shell(
    "configureStrelkaGermlineWorkflow.py "  # configure the strelka run
    "--bam {bam} "  # input bam
    "--referenceFasta {snakemake.input.fasta} "  # reference genome
    "--runDir {run_dir} "  # output directory
    "{config_extra} "  # additional parameters for the configuration
    "&& {run_dir}/runWorkflow.py "  # run the strelka workflow
    "-m local "  # run in local mode
    "-j {snakemake.threads} "  # number of threads
    "{run_extra} "  # additional parameters for the run
    "{log}"
)  # logging
__author__ = "Thibault Dayris"
__copyright__ = "Copyright 2019, Dayris Thibault"
__email__ = "[email protected]"
__license__ = "MIT"

from pathlib import Path
from snakemake.shell import shell
from snakemake.utils import makedirs

log = snakemake.log_fmt_shell(stdout=True, stderr=True)

config_extra = snakemake.params.get("config_extra", "")
run_extra = snakemake.params.get("run_extra", "")

# If a normal bam is given, it should be provided in the input
# block, so that Snakemake performs additional tests on file
# existence.
normal = (
    "--normalBam {}".format(snakemake.input["normal"])
    if "normal" in snakemake.input.keys()
    else ""
)

if snakemake.output[0].endswith("vcf.gz"):
    run_dir = Path(snakemake.output[0]).parents[2]
else:
    run_dir = snakemake.output

shell(
    "(configureStrelkaSomaticWorkflow.py "  # Configuration script
    "{normal} "  # Path to normal bam (if any)
    "--tumorBam {snakemake.input.tumor} "  # Path to tumor bam
    "--referenceFasta {snakemake.input.fasta} "  # Path to fasta file
    "--runDir {run_dir} "  # Path to output directory
    "{config_extra} "  # Extra parametersfor configuration
    " && "
    "{run_dir}/runWorkflow.py "  # Run the pipeline
    "--mode local "  # Stop internal job submission
    "--jobs {snakemake.threads} "  # Nomber of threads
    "{run_extra}) "  # Extra parameters for runWorkflow
    "{log}"  # Logging behaviour
)
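
A sketch of a calling rule with assumed paths; because the script takes parents[2] of a .vcf.gz output, the run directory here resolves to results/strelka/{pair}. Indexed BAMs and an indexed reference are assumed:

rule strelka_somatic:
    input:
        normal="recal/{pair}_N.sorted.bam",
        tumor="recal/{pair}_T.sorted.bam",
        fasta="resources/genome.fasta"
    output:
        "results/strelka/{pair}/results/variants/somatic.snvs.vcf.gz"
    log:
        "logs/strelka/{pair}.log"
    params:
        config_extra="",
        run_extra=""
    threads: 8
    wrapper:
        "0.65.0/bio/strelka/somatic"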
wrapper:
    "0.59.2/bio/vep/annotate"
wrapper:
    "0.59.2/bio/vep/annotate"
wrapper:
    "0.65.0/bio/strelka/somatic"
wrapper:
    "0.65.0/bio/strelka/germline"
wrapper:
    "0.60.0/bio/bcftools/view"
wrapper:
    "0.60.0/bio/bcftools/concat"
wrapper:
    "0.60.0/bio/bcftools/view"
wrapper:
    "0.60.0/bio/bcftools/reheader"
wrapper:
    "0.64.0/bio/bcftools/concat"
wrapper:
    "0.60.0/bio/hap.py/pre.py"
wrapper:
    "0.65.0/bio/bcftools/norm"
wrapper:
    "0.65.0/bio/freebayes"
shell:
    "rbt vcf-split {input} {output}"
shell:
    'vembrane filter --output-fmt bcf --output {output} "{params.filter}" {input} &> {log}'
shell:
    "varlociraptor filter-calls posterior-odds --events {params.events} --odds barely < {input} > {output} 2> {log}"
wrapper:
    "0.67.0/bio/bcftools/concat"
shell:
    "varlociraptor filter-calls control-fdr {input} --var {wildcards.vartype} "
    "--events {params.query[events]} --fdr {params.query[threshold]} > {output} 2> {log}"
wrapper:
    "0.59.2/bio/bcftools/concat"
shell:
    "echo -e 'normal {params.prefix}_N\ntumor {params.prefix}_T' > {output}"
wrapper:
    "0.60.0/bio/bcftools/reheader"
shell:
    "HLA-LA.pl --bam {input.bam} --sampleID {wildcards.sample} --graph {params.graph} --customGraphDir {params.graphdir} --workingDir results/HLA-LA/output --maxThreads {threads} > {log} 2>&1"
script:
    "../scripts/parse_HLA_types.py"
wrapper:
    "0.61.0/bio/razers3"
wrapper:
    "0.61.0/bio/samtools/bam2fq/interleaved"
wrapper:
    "0.63.0/bio/optitype"
shell:
    "cut {input} -f2-7 | awk 'NR == 1 {{print}} NR>1 {{for (i = 1; i<=6; ++i) sub(/^/, \"&HLA-\", $i); print}}' "
    '| sed -e s/[*,:]//g | sed "s/ /\t/g" > {output} 2> {log}'
wrapper:
    "0.56.0/bio/bwa/mem"
wrapper:
    "0.39.0/bio/picard/markduplicates"
wrapper:
    "0.62.0/bio/gatk/baserecalibratorspark"
wrapper:
    "0.62.0/bio/gatk/applybqsr"
run:
    alleles = ",".join(pd.read_csv(input.alleles, sep="\t").iloc[0])
    cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
    shell(cmd)
run:
    alleles = ",".join(pd.read_csv(input.alleles, sep="\t")["Allele"].tolist())
    cmd = "if [ -s {input.peptides} ]; then {params.netMHC}/netMHCIIpan {params.extra} -xlsfile {output} -a {alleles} -f {input.peptides} > {log}; else touch {output}; fi"
    shell(cmd)
script:
    "../scripts/group_mhc_output.py"
script:
    "../scripts/merge_data.py"
script:
    "../scripts/add_rna_info.py"
shell:
    "microphaser somatic {input.bam} --variants {input.vcf} --ref {input.ref} --tsv {output.tsv} -n {output.wt_fasta} -w {params.window_length} "
    "< {input.track} > {output.mt_fasta} 2> {log}"
shell:
    "microphaser normal {input.bam} --variants {input.vcf} --ref {input.ref} -t {output.wt_tsv} -w {params.window_length} "
    "< {input.track} > {output.wt_fasta} 2> {log}"
shell:
    "cat {input} > {output} 2> {log}"
shell:
    "microphaser build_reference -r {input} -o {output.bin} -l {params.length} --peptides {output.fasta} > {log} 2>&1"
shell:
    "microphaser filter -r {input.proteome} -t {input.tsv} -o {output.tsv} -n {output.wt_fasta} -s {output.removed} -l {params.length} > {output.mt_fasta} 2>{log}"
shell:
    "xsv cat rows -d '\t' {input} | xsv fmt -t '\t' -d ',' > {output} 2>{log}"
wrapper:
    "0.45.1/bio/reference/ensembl-sequence"
From line 12 of rules/ref.smk
wrapper:
    "0.45.1/bio/reference/ensembl-sequence"
From line 27 of rules/ref.smk
wrapper:
    "0.60.1/bio/kallisto/index"
From line 41 of rules/ref.smk
wrapper:
    "0.45.1/bio/reference/ensembl-annotation"
From line 57 of rules/ref.smk
wrapper:
    "0.42.0/bio/star/index"
From line 74 of rules/ref.smk
shell:
    "grep '^{wildcards.contig}\t' {input} > {output}"
From line 85 of rules/ref.smk
wrapper:
    "0.45.1/bio/samtools/faidx"
From line 97 of rules/ref.smk
wrapper:
    "0.45.1/bio/picard/createsequencedictionary"
From line 109 of rules/ref.smk
shell:
    "paste <(cut -f1 {input}) <(yes 0 | head -n {params.n_contigs}) <(cut -f2 {input})"
    " | head -n {params.n_contigs} | bgzip -c > {output} && tabix -p bed {output}"
wrapper:
    "0.59.2/bio/reference/ensembl-variation"
From line 143 of rules/ref.smk
shell:
    "rbt vcf-fix-iupac-alleles < {input} | bcftools view -Oz > {output}"
wrapper:
    "0.45.1/bio/bwa/index"
From line 169 of rules/ref.smk
shell:
    "cd resources/graphs && wget http://www.well.ox.ac.uk/downloads/PRG_MHC_GRCh38_withIMGT.tar.gz "
    "&& tar -xvzf PRG_MHC_GRCh38_withIMGT.tar.gz"
From line 187 of rules/ref.smk
shell:
    "HLA-LA.pl --prepareGraph 1 --customGraphDir {params.path} --graph {params.graph} > {log} 2>&1"
wrapper:
    "0.59.2/bio/vep/cache"
From line 220 of rules/ref.smk
wrapper:
    "0.59.2/bio/vep/plugins"
From line 232 of rules/ref.smk
shell:
    "echo 'TUMOR' > {output}"
From line 241 of rules/ref.smk
wrapper:
    "0.60.1/bio/kallisto/quant"
wrapper:
    "0.42.0/bio/star/align"
From line 35 of rules/RNA.smk
wrapper:
    "0.60.1/bio/arriba"
shell:
    "varlociraptor estimate tmb "
    " --plot-mode {wildcards.plotmode} "
    "--coding-genome-size {params.coding_genome_size} "
    "--somatic-tumor-events {params.somatic_events} "
    "--tumor-sample {params.tumor_sample} "
    "< {input} > {output} 2> {log}"
wrapper:
    "0.56.0/bio/sra-tools/fasterq-dump"
shell:
    "cat {input} > {output} 2> {log}"
From line 21 of rules/trim.smk
wrapper:
    "0.59.2/bio/cutadapt/pe"
From line 40 of rules/trim.smk
wrapper:
    "0.59.2/bio/cutadapt/se"
From line 56 of rules/trim.smk
shell:
    "cat {input} > {output} 2> {log}"
From line 70 of rules/trim.smk
wrapper:
    "0.60.0/bio/bcftools/index"
wrapper:
    "0.59.2/bio/samtools/index"
wrapper:
    "0.59.2/bio/tabix"
shell:
    "gzip < {input} > {output}"
script:
    "../scripts/tsv_to_xlsx.py"
script:
    "../scripts/render-scenario.py"
shell:
    "varlociraptor preprocess variants {params.omit_isize} --candidates {input.candidates} "
    "{input.ref} --bam {input.bam} --output {output} 2> {log}"
shell:
    "varlociraptor "
    "call variants generic --obs {params.obs} "
    "--scenario {input.scenario} > {output} 2> {log}"
shell:
    "bcftools sort --max-mem {resources.mem_mb}M --temp-dir `mktemp -d` "
    "-Ob {input} > {output} 2> {log}"
wrapper:
    "0.59.2/bio/bcftools/concat"
shell:
    "vl2svg {input} {output} > {log} 2>&1"
From line 14 of rules/vega.smk
import pandas as pd

## load data table
data = pd.read_csv(snakemake.input["table"], sep='\t')

## Merge transcript count
transcript_count = pd.read_csv(snakemake.params["abundance"], sep='\t')
transcript_count = transcript_count[["target_id", "tpm"]]
transcript_count.columns = ["Transcript_ID", "TPM"]
transcript_count["Transcript_ID"] = transcript_count["Transcript_ID"].str.split('.', expand=True)[0]
data = data.merge(transcript_count, on="Transcript_ID", how="left")

data.to_csv(snakemake.output[0], sep='\t', index=False)
import sys
import os
import pandas as pd
import numpy as np

first = True
out = open(snakemake.output[0], "w")
for e in snakemake.input:
    if os.path.getsize(e) > 0:
        mhcout = open(e, 'r')
        alleles = next(mhcout).split('\t')
        header = next(mhcout).rstrip().split('\t')
        if first:
            allele = ''
            for i in range(0, len(header)):
                #print(header[i].rstrip())
                if i < len(alleles):
                    #print(alleles[i])
                    if alleles[i] != '':
                        allele = alleles[i].rstrip() + '_'
                header[i] = allele + header[i].rstrip()
            header[len(header) -1] = "NB"
            first = False
            #print(header)
            out.write('\t'.join(header) + '\n')
        for line in mhcout:
            out.write(line)
out.close()
import sys
import os
import pandas as pd
import numpy as np


def select_columns(mhc):
    rank_cols = [c for c in mhc.columns if "Rank" in c]
    affinity_cols = [c for c in mhc.columns if "nM" in c]
    mhc_cols = ["Pos"] + ["ID"] + ["Peptide"] + rank_cols + affinity_cols + ["NB"]
    mhc = mhc[mhc_cols]
    mhc["Rank_min"] = mhc[rank_cols].min(axis=1)
    mhc["Aff_min"] = mhc[affinity_cols].min(axis=1)
    mhc["Top_rank_HLA"] = mhc[rank_cols].idxmin(axis=1)
    mhc["Top_affinity_HLA"] = mhc[affinity_cols].idxmin(axis=1)
    mhc["Top_rank_HLA"] = mhc["Top_rank_HLA"].str.replace("_Rank","")
    mhc["Top_affinity_HLA"] = mhc["Top_affinity_HLA"].str.replace("_nM","")
    return mhc

def merge(info, tumor, normal, outfile):
    tumor = select_columns(tumor)
    normal = select_columns(normal)
    id_length = len(tumor.ID[0])
    print(info.columns)
    info["ID"] = info["id"].astype(str).str[:id_length]
    merged_mhc = tumor.merge(normal,how='left', on=['Pos','ID'])
    merged_mhc = merged_mhc.rename(columns={col: col.replace("_y","_normal") for col in merged_mhc.columns}).rename(columns={col: col.replace("_x","_tumor") for col in merged_mhc.columns})
    info = info.rename(columns={"gene_id":"Gene_ID","gene_name":"Gene_Symbol","strand":"Strand","positions":"Variant_Position","chrom":"Chromosome","somatic_aa_change":"Somatic_AminoAcid_Change"})
    merged_dataframe = merged_mhc.merge(info, how='left', on = 'ID')

    merged_dataframe["Peptide_tumor"] = merged_dataframe[["Peptide_tumor","Peptide_normal"]].apply(lambda x: diffEpitope(*x), axis=1)
    ## Are all possible variants in the peptide ("Cis") or not ("Trans")
    merged_dataframe["Variant_Orientation"] = "Cis"
    trans = merged_dataframe.nvariant_sites > merged_dataframe.nvar
    merged_dataframe.loc[trans, "Variant_Orientation"] = "Trans"

    ## check missense/silent mutation status
    nonsilent = merged_dataframe.Peptide_tumor != merged_dataframe.Peptide_normal
    merged_dataframe = merged_dataframe[nonsilent]
    merged_dataframe = merged_dataframe.drop_duplicates(subset=["transcript","offset","Peptide_tumor","Somatic_AminoAcid_Change"])

    data = merged_dataframe[["ID","transcript","Gene_ID","Gene_Symbol","Chromosome","offset","freq","depth",
    "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","NB_tumor","Rank_min_tumor","Aff_min_tumor",
    "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","NB_normal",
    "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]]

    data.columns = ["ID","Transcript_ID","Gene_ID","Gene_Symbol","Chromosome","Position","Frequency","Read_Depth",
    "Somatic_AminoAcid_Change", "nvar", "nsomatic", "somatic_positions", "Peptide_tumor","BindingHLAs_tumor","Rank_min_tumor","Affinity_min_tumor",
    "Top_rank_HLA_tumor","Top_affinity_HLA_tumor","Peptide_normal","BindingHLAs_normal",
    "Rank_min_normal","Aff_min_normal","Top_rank_HLA_normal","Top_affinity_HLA_normal"]

    # data = data[data.BindingHLAs_tumor > 0]
    # data = data[(data.NB_normal.isna()) | (data.NB_normal == 0)]
    #data = data[(data.BindingHLAs_normal == 0)]

    ### Remove peptides that include a stop codon
    data = data[data.Peptide_tumor.str.count("x") == 0]
    data = data[data.Peptide_tumor.str.count("X") == 0]
    data.sort_values(["Chromosome", "somatic_positions"], inplace=True)
    ### Remove Duplicate kmers
    data = data.drop_duplicates(["Transcript_ID", "Peptide_tumor", "Somatic_AminoAcid_Change", "Peptide_normal"])

    data.to_csv(outfile, index=False, sep = '\t')


## highlight the difference between mutated neopeptide and wildtype
def diffEpitope(e1,e2):
    if str(e2) == 'nan' or str(e2) == '':
        return(e1)
    e1 = str(e1)
    e2 = str(e2)
    diff_pos = [i for i in range(len(e1)) if e1[i] != e2[i]]
    e_new = e1
    e2_new = e2
    for p in diff_pos:
        e_new = e_new[:p] + e_new[p].lower() + e_new[p+1:]
        e2_new = e2_new[:p] + e2_new[p].lower() + e2_new[p+1:]
    return(e_new)


def main():
    info = pd.read_csv(snakemake.input[0], sep = '\t', dtype=str)
    tumor = pd.read_csv(snakemake.input[1], sep = '\t')
    normal = pd.read_csv(snakemake.input[2], sep = '\t')
    outfile = snakemake.output[0]
    merge(info, tumor, normal, outfile)

if __name__ == '__main__':
    sys.exit(main())
import pandas as pd

hlaI = ["A","B","C"]

hlaII = ["DRB1", "DPA1", "DPB1", "DQA1", "DQB1"]

hlas = pd.read_csv(snakemake.input[0], sep='\t')

hlasI = hlas[hlas.Locus.isin(hlaI)]
hlasI["Allele"]="HLA-" + hlasI.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','')
hlasI = hlasI[["Allele"]].drop_duplicates()
hlasI.to_csv(snakemake.output[0], sep='\t', index=False)

hlasII = hlas[hlas.Locus.isin(hlaII)]
hlasII["HLA"] = hlasII.Locus.str[0:2]
hlasII["Allele"] = hlasII.Allele.str.split(":", expand=True)[[0,1]].apply(lambda x: ''.join(x), axis=1).str.replace('*','')

hlasII = pd.DataFrame("HLA-" + hlasII.groupby(["HLA","Chromosome"])["Allele"].apply(lambda x: "-".join(x)).reset_index()["Allele"]).drop_duplicates()
hlasII.loc[hlasII.Allele.str.contains("DRB"), "Allele"] = hlasII[hlasII.Allele.str.contains("DRB")]["Allele"].str.replace("HLA-DRB1","DRB1_")
hlasII.to_csv(snakemake.output[1], sep='\t', index=False)
from jinja2 import Template
import pandas as pd

with open(snakemake.input[0]) as template, open(snakemake.output[0], "w") as out:
    samples = snakemake.params.samples
    out.write(Template(template.read()).render(
        samples=samples[(samples["sample"] == snakemake.wildcards.pair) | (samples["sample"] == samples.loc[snakemake.wildcards.pair, "matched_normal"])]
    ))
import sys
import pandas as pd

sys.stderr = open(snakemake.log[0], "w")

data = pd.read_csv(snakemake.input.tsv, sep="\t")
data.to_excel(snakemake.output.xlsx, index=False)

Created: 1yr ago
Updated: 1yr ago
Maintainers: public
URL: https://github.com/snakemake-workflows/dna-seq-neoantigen-prediction
Name: dna-seq-neoantigen-prediction
Version: neopredict-lung-v0.1
License: MIT License